From 6f9ad7f025fafe59eed820a15e31a38689a5f166 Mon Sep 17 00:00:00 2001
From: Paillat
Date: Sat, 1 Jul 2023 21:46:19 +0200
Subject: [PATCH] feat(wiki_downloader.py): add script to download images from Wikipedia

This commit adds a new script called `wiki_downloader.py` which allows
downloading images from Wikipedia. The script uses Selenium and
undetected_chromedriver to automate the process of searching for an image on
Google Images (restricted to `site:wikipedia.org`) and downloading it.

The `download_image` function takes two parameters: `query` and
`download_path`. It opens a Chrome browser using undetected_chromedriver (the
browser is not started in headless mode) and navigates to the Google Images
search page with the specified query. After a short delay, it clicks on the
"tout refuser" ("reject all") button to dismiss the cookie-consent prompt.
Then, it clicks the first image in the search results and extracts the source
URL of the enlarged preview. The image is then obtained either by decoding a
base64-encoded `data:` URI or by making an HTTP request to the image URL. The
downloaded image is saved to the specified `download_path`, which must include
the output file name; its extension determines the saved image format.
"""Download the first Google Images hit for a query restricted to wikipedia.org.

This is the ``utils/wiki_downloader.py`` script. It also includes a test case
that demonstrates how to use :func:`download_image`: running the module
directly downloads an image for the query ``"test"`` into ``test.png`` in the
current working directory.
"""

import base64
import os
import time
from io import BytesIO
from urllib.parse import quote_plus

import requests
import undetected_chromedriver as uc
from PIL import Image
from selenium.webdriver.common.by import By

# Seconds to wait on the image HTTP request before giving up, so a stalled
# server cannot hang the whole script.
_REQUEST_TIMEOUT = 30


def _build_search_url(query):
    """Return the Google Images search URL for *query*, limited to wikipedia.org."""
    # quote_plus turns spaces into '+' (as the search URL expects) and also
    # percent-encodes characters such as '&' or '#' that would otherwise be
    # interpreted as URL syntax and truncate or corrupt the query.
    return (
        "https://www.google.com/search?site=&tbm=isch&source=hp&biw=1873&bih=990"
        f"&tbs=isz:l&q=site:wikipedia.org+{quote_plus(query)}"
    )


def _read_image_bytes(src):
    """Return the raw image bytes referenced by an ``<img>`` ``src`` value.

    ``data:`` URIs are decoded as base64; anything else is fetched over HTTP.

    Raises:
        ValueError: if *src* is empty.
        requests.RequestException: if the HTTP download fails or times out.
    """
    if not src:
        raise ValueError("the selected image element has no src attribute")

    if src.startswith("data:"):
        # Everything after the first comma is the base64 payload.
        return base64.b64decode(src.split(",", 1)[1])

    response = requests.get(src, timeout=_REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.content


def download_image(query, download_path):
    """Search Google Images for *query* on wikipedia.org and save the first hit.

    Args:
        query: Free-text search terms.
        download_path: Destination file path, including the file name. Pillow
            picks the output format from this path's extension.

    Returns:
        None. Failures are not raised: any exception is caught and reported
        on stdout, and the browser is always closed afterwards.
    """
    options = uc.ChromeOptions()
    options.add_argument("--no-sandbox")
    driver = uc.Chrome(options=options)

    try:
        driver.get(_build_search_url(query))
        time.sleep(2)

        # NOTE(review): the class names below are Google's generated CSS
        # classes; presumably consent buttons, result thumbnails and the
        # enlarged preview respectively - they may change without notice.
        # "tout refuser" is the French label for "reject all".
        for button in driver.find_elements(By.CLASS_NAME, "VfPpkd-vQzf8d"):
            if button.text.lower() == "tout refuser":
                button.click()
                break

        time.sleep(10)
        driver.find_element(By.CLASS_NAME, "rg_i").click()
        time.sleep(2)
        src = driver.find_element(By.CLASS_NAME, "r48jcc").get_attribute("src") or ""

        image_content = _read_image_bytes(src)

        # Flatten alpha/palette images to RGB before saving; formats such as
        # JPEG cannot store those modes.
        img = Image.open(BytesIO(image_content))
        if img.mode in ("RGBA", "P"):
            img = img.convert("RGB")

        img.save(download_path)  # download_path already contains the output filename

        print('Image downloaded successfully at ', download_path)

    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        # Always release the browser, including on KeyboardInterrupt during
        # the sleeps above.
        driver.quit()


if __name__ == "__main__":
    # Manual smoke test / usage example.
    download_image("test", os.path.join(os.getcwd(), "test.png"))