# utils/wiki_downloader.py

import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
import requests
import os
import time
import base64
from PIL import Image
from io import BytesIO

def download_image(query, download_path):
    """Download the first Google Images hit for *query* on wikipedia.org.

    Drives a Chrome session through undetected_chromedriver: opens a Google
    Images search restricted to ``site:wikipedia.org``, clicks the consent
    dialog's "tout refuser" button when one is present, clicks the first
    thumbnail, reads the ``src`` of the enlarged preview and saves the image
    to *download_path* with PIL.

    Args:
        query: Free-text search terms; URL-encoded before being sent.
        download_path: Destination path including the file name.

    Returns:
        None. Any failure after the browser has started is reported on
        stdout instead of being raised (best-effort behaviour); the browser
        is always closed.
    """
    # Function-scope import so the block does not depend on a file-level change.
    from urllib.parse import quote_plus

    options = uc.ChromeOptions()
    options.add_argument('--no-sandbox')
    driver = uc.Chrome(options=options)

    try:
        # quote_plus turns spaces into '+' (as before) and also escapes
        # characters such as '&' or '#' that would otherwise break the URL.
        search_url = (
            "https://www.google.com/search?site=&tbm=isch&source=hp"
            f"&biw=1873&bih=99&q=site:wikipedia.org+{quote_plus(query)}"
        )
        driver.get(search_url)
        time.sleep(2)

        # Click the consent dialog's "tout refuser" (French UI) button if shown.
        for button in driver.find_elements(By.CLASS_NAME, "VfPpkd-vQzf8d"):
            if button.text.lower() == "tout refuser":
                button.click()
                break
        time.sleep(1)

        # Open the first thumbnail and give the preview pane time to load.
        driver.find_element(By.CLASS_NAME, "rg_i").click()
        time.sleep(5)

        src = driver.find_element(By.CLASS_NAME, "r48jcc").get_attribute("src") or ""
        if not src:
            raise ValueError("preview image has no src attribute")

        if src.startswith("data:"):
            # Inline data URI: everything after the first comma is the payload.
            image_content = base64.b64decode(src.split(",", 1)[1])
        else:
            # A timeout keeps a stalled server from hanging the caller forever.
            response = requests.get(src, timeout=30)
            response.raise_for_status()
            image_content = response.content

        # Flatten alpha/palette images to RGB before saving. This does not
        # force PNG output; the file is written to download_path as given.
        img = Image.open(BytesIO(image_content))
        if img.mode in ("RGBA", "P"):
            img = img.convert("RGB")

        img.save(download_path)  # download_path already contains the output filename

        print('Image downloaded successfully at ', download_path)

    except Exception as e:
        # Deliberate best-effort: report the problem and return normally.
        print(f"An error occurred: {e}")
    finally:
        # Single cleanup point for both the success and the failure path.
        driver.quit()

if __name__ == "__main__":
    # Manual smoke test: fetch an image for the query "test" and store it
    # as test.png in the current working directory.
    output_file = os.path.join(os.getcwd(), "test.png")
    download_image("test", output_file)
feat(wiki_downloader.py): add script to download images from Wikipedia. This commit adds a new script called `wiki_downloader.py` which allows downloading images from Wikipedia. The script uses Selenium and undetected_chromedriver to automate the process of searching for an image on Google and downloading it. The `download_image` function takes two parameters: `query` and `download_path`. It opens a Chrome browser using undetected_chromedriver and navigates to the Google Images search page with the specified query. After a short delay, it clicks on the "tout refuser" ("reject all") button to dismiss the cookie-consent prompt. Then, it finds the first image in the search results, clicks it, and extracts the source URL of the enlarged preview. The image is then downloaded either by decoding a base64-encoded image or by making a request to the image URL. The downloaded image is saved in the specified `download_path`. The script also includes a test case that demonstrates how to use the `download_image` function. 2023-07-01 21:46:19 +02:00			`import undetected_chromedriver as uc`
			`from selenium.webdriver.common.by import By`
			`import requests`
			`import os`
			`import time`
			`import base64`
			`from PIL import Image`
			`from io import BytesIO`

			`def download_image(query, download_path):`
			`options = uc.ChromeOptions()`
			`options.add_argument('--no-sandbox')`
			`driver = uc.Chrome(options=options)`

			`try:`
feat(audio_prompts): add default audio prompts for narrator feat(audio_prompts): add en_narrator_deep audio prompt for narrator feat(audio_prompts): add en_narrator_light_bg audio prompt for narrator fix(video.py): fix indentation and add prompt for generating thumbnail fix(montage.py): fix indentation and add prompt for generating thumbnail fix(montage.py): fix image download for wikimage slides fix(speak.py): remove unused import statement fix(speak.py): remove unused variable 'fakenames' feat(speak.py): add function 'remove_blank_moments' to remove silent parts from audio file feat(speak.py): add function 'optimize_string_groups' to optimize string groups for audio generation fix(speak.py): fix comment indentation in 'generate_voice' function fix(speak.py): remove unused imports in 'generate_voice' function fix(speak.py): remove unused variable 'reduced_noise' in 'generate_voice' function fix(speak.py): remove unused import statements in 'generate_voice' function fix(speak.py): remove unused import statement for 'logging' module fix(speak.py): remove unused print statements in 'main' function fix(speak.py): remove unused import statement for 'logging' module fix(speak.py): remove unused print statements in 'main' function fix(speak.py): fix(wiki_downloader.py): fix Google search URL to include correct query parameter fix(wiki_downloader.py): reduce sleep time after page load to 1 second fix(wiki_downloader.py): increase sleep time after image click to 5 seconds 2023-07-02 11:17:10 +02:00			`driver.get(f"https://www.google.com/search?site=&tbm=isch&source=hp&biw=1873&bih=99&q=site:wikipedia.org+{query.replace(' ', '+')}")`
feat(wiki_downloader.py): add script to download images from Wikipedia. This commit adds a new script called `wiki_downloader.py` which allows downloading images from Wikipedia. The script uses Selenium and undetected_chromedriver to automate the process of searching for an image on Google and downloading it. The `download_image` function takes two parameters: `query` and `download_path`. It opens a Chrome browser using undetected_chromedriver and navigates to the Google Images search page with the specified query. After a short delay, it clicks on the "tout refuser" ("reject all") button to dismiss the cookie-consent prompt. Then, it finds the first image in the search results, clicks it, and extracts the source URL of the enlarged preview. The image is then downloaded either by decoding a base64-encoded image or by making a request to the image URL. The downloaded image is saved in the specified `download_path`. The script also includes a test case that demonstrates how to use the `download_image` function. 2023-07-01 21:46:19 +02:00			`time.sleep(2)`

			`tos = driver.find_elements(By.CLASS_NAME, "VfPpkd-vQzf8d")`
			`for to in tos:`
			`if to.text.lower() == "tout refuser":`
			`to.click()`
			`break`
feat(audio_prompts): add default audio prompts for narrator feat(audio_prompts): add en_narrator_deep audio prompt for narrator feat(audio_prompts): add en_narrator_light_bg audio prompt for narrator fix(video.py): fix indentation and add prompt for generating thumbnail fix(montage.py): fix indentation and add prompt for generating thumbnail fix(montage.py): fix image download for wikimage slides fix(speak.py): remove unused import statement fix(speak.py): remove unused variable 'fakenames' feat(speak.py): add function 'remove_blank_moments' to remove silent parts from audio file feat(speak.py): add function 'optimize_string_groups' to optimize string groups for audio generation fix(speak.py): fix comment indentation in 'generate_voice' function fix(speak.py): remove unused imports in 'generate_voice' function fix(speak.py): remove unused variable 'reduced_noise' in 'generate_voice' function fix(speak.py): remove unused import statements in 'generate_voice' function fix(speak.py): remove unused import statement for 'logging' module fix(speak.py): remove unused print statements in 'main' function fix(speak.py): remove unused import statement for 'logging' module fix(speak.py): remove unused print statements in 'main' function fix(speak.py): fix(wiki_downloader.py): fix Google search URL to include correct query parameter fix(wiki_downloader.py): reduce sleep time after page load to 1 second fix(wiki_downloader.py): increase sleep time after image click to 5 seconds 2023-07-02 11:17:10 +02:00			`time.sleep(1)`
feat(wiki_downloader.py): add script to download images from Wikipedia. This commit adds a new script called `wiki_downloader.py` which allows downloading images from Wikipedia. The script uses Selenium and undetected_chromedriver to automate the process of searching for an image on Google and downloading it. The `download_image` function takes two parameters: `query` and `download_path`. It opens a Chrome browser using undetected_chromedriver and navigates to the Google Images search page with the specified query. After a short delay, it clicks on the "tout refuser" ("reject all") button to dismiss the cookie-consent prompt. Then, it finds the first image in the search results, clicks it, and extracts the source URL of the enlarged preview. The image is then downloaded either by decoding a base64-encoded image or by making a request to the image URL. The downloaded image is saved in the specified `download_path`. The script also includes a test case that demonstrates how to use the `download_image` function. 2023-07-01 21:46:19 +02:00			`image = driver.find_element(By.CLASS_NAME, "rg_i")`
			`image.click()`
feat(audio_prompts): add default audio prompts for narrator feat(audio_prompts): add en_narrator_deep audio prompt for narrator feat(audio_prompts): add en_narrator_light_bg audio prompt for narrator fix(video.py): fix indentation and add prompt for generating thumbnail fix(montage.py): fix indentation and add prompt for generating thumbnail fix(montage.py): fix image download for wikimage slides fix(speak.py): remove unused import statement fix(speak.py): remove unused variable 'fakenames' feat(speak.py): add function 'remove_blank_moments' to remove silent parts from audio file feat(speak.py): add function 'optimize_string_groups' to optimize string groups for audio generation fix(speak.py): fix comment indentation in 'generate_voice' function fix(speak.py): remove unused imports in 'generate_voice' function fix(speak.py): remove unused variable 'reduced_noise' in 'generate_voice' function fix(speak.py): remove unused import statements in 'generate_voice' function fix(speak.py): remove unused import statement for 'logging' module fix(speak.py): remove unused print statements in 'main' function fix(speak.py): remove unused import statement for 'logging' module fix(speak.py): remove unused print statements in 'main' function fix(speak.py): fix(wiki_downloader.py): fix Google search URL to include correct query parameter fix(wiki_downloader.py): reduce sleep time after page load to 1 second fix(wiki_downloader.py): increase sleep time after image click to 5 seconds 2023-07-02 11:17:10 +02:00			`time.sleep(5)`
feat(wiki_downloader.py): add script to download images from Wikipedia. This commit adds a new script called `wiki_downloader.py` which allows downloading images from Wikipedia. The script uses Selenium and undetected_chromedriver to automate the process of searching for an image on Google and downloading it. The `download_image` function takes two parameters: `query` and `download_path`. It opens a Chrome browser using undetected_chromedriver and navigates to the Google Images search page with the specified query. After a short delay, it clicks on the "tout refuser" ("reject all") button to dismiss the cookie-consent prompt. Then, it finds the first image in the search results, clicks it, and extracts the source URL of the enlarged preview. The image is then downloaded either by decoding a base64-encoded image or by making a request to the image URL. The downloaded image is saved in the specified `download_path`. The script also includes a test case that demonstrates how to use the `download_image` function. 2023-07-01 21:46:19 +02:00			`image = driver.find_element(By.CLASS_NAME, "r48jcc").get_attribute("src") or ""`

			`image_content = None`

			`if image.startswith("data:"):`
			`image_content = base64.b64decode(image.split(",")[1])`
			`else:`
			`response = requests.get(image, stream=True)`
			`response.raise_for_status()`
			`image_content = response.content`

			`# Convert all images to PNG format using PIL, if they aren't already`
			`img = Image.open(BytesIO(image_content))`
			`if img.mode in ("RGBA", "P"):`
			`img = img.convert("RGB")`

			`img.save(download_path) # download_path already contains the output filename`

			`print('Image downloaded successfully at ', download_path)`

			`driver.quit()`

			`except Exception as e:`
			`print(f"An error occurred: {e}")`
			`driver.quit()`

			`if __name__ == "__main__":`
			`#test`
			`download_image("test", os.path.join(os.getcwd(), "test.png"))`