feat(wiki_downloader.py): add script to download images from Wikipedia

This commit adds a new script called `wiki_downloader.py` which allows downloading images from Wikipedia. The script uses Selenium and undetected_chromedriver to automate the process of searching for an image on Google (restricted to `site:wikipedia.org`) and downloading it. The `download_image` function takes two parameters: `query` and `download_path`. It opens a Chrome browser using undetected_chromedriver and navigates to the Google Images search page with the specified query. After a short delay, it clicks on the "tout refuser" ("reject all") button to dismiss the cookies prompt. Then, it clicks the first image in the search results and extracts the source URL of its preview. The image is then downloaded either by decoding a base64-encoded image or by making a request to the image URL. The downloaded image is saved in the specified `download_path`. The script also includes a test case that demonstrates how to use the `download_image` function.
2026-01-02 09:16:20 +00:00 · 2023-07-01 21:46:19 +02:00
parent 866f1b4138
commit 6f9ad7f025
1 changed files with 57 additions and 0 deletions
--- a/utils/wiki_downloader.py
+++ b/utils/wiki_downloader.py
@@ -0,0 +1,57 @@
+import undetected_chromedriver as uc
+from selenium.webdriver.common.by import By
+import requests
+import os
+import time
+import base64
+from PIL import Image
+from io import BytesIO
+
+def download_image(query, download_path):
+    """Download the first Google Images hit for *query* on wikipedia.org.
+
+    Opens Chrome through undetected_chromedriver, runs a Google Images
+    search restricted to ``site:wikipedia.org``, clicks the first thumbnail
+    and saves the previewed image to *download_path* with PIL.
+
+    Args:
+        query: Free-text search terms.
+        download_path: Full output path including the file name.
+
+    Errors raised while browsing, downloading or saving are caught and
+    reported on stdout instead of being re-raised (best effort); the
+    function always returns ``None``.
+    """
+    # Function-scope import so the block does not depend on a file-level change.
+    from urllib.parse import quote_plus
+
+    options = uc.ChromeOptions()
+    options.add_argument('--no-sandbox')
+    driver = uc.Chrome(options=options)
+
+    try:
+        # quote_plus escapes '&', '#', '+', ... which would otherwise corrupt
+        # the query string; spaces still become '+', as before.
+        driver.get(f"https://www.google.com/search?site=&tbm=isch&source=hp&biw=1873&bih=990&tbs=isz:l&q=site:wikipedia.org+{quote_plus(query)}")
+        time.sleep(2)
+
+        # Dismiss the consent dialog if it is shown. Only the French
+        # "Tout refuser" ("reject all") label is matched.
+        for button in driver.find_elements(By.CLASS_NAME, "VfPpkd-vQzf8d"):
+            if button.text.lower() == "tout refuser":
+                button.click()
+                break
+
+        time.sleep(10)
+        driver.find_element(By.CLASS_NAME, "rg_i").click()
+        time.sleep(2)
+        src = driver.find_element(By.CLASS_NAME, "r48jcc").get_attribute("src") or ""
+        if not src:
+            # Fail with a clear message instead of an obscure requests error.
+            raise ValueError("preview image has no src attribute")
+
+        if src.startswith("data:"):
+            # Inline data URI: the payload is everything after the first comma.
+            image_content = base64.b64decode(src.partition(",")[2])
+        else:
+            # A timeout keeps a stalled server from hanging the script forever.
+            response = requests.get(src, timeout=30)
+            response.raise_for_status()
+            image_content = response.content
+
+        # Flatten alpha/palette modes to RGB before saving.
+        img = Image.open(BytesIO(image_content))
+        if img.mode in ("RGBA", "P"):
+            img = img.convert("RGB")
+
+        img.save(download_path)  # download_path already contains the output filename
+
+        print('Image downloaded successfully at ', download_path)
+
+    except Exception as e:
+        print(f"An error occurred: {e}")
+    finally:
+        # Single cleanup point: the browser is closed on success and on failure.
+        driver.quit()
+
+if __name__ == "__main__":
+    # Manual smoke test: search for "test" and save the result as test.png
+    # in the current working directory.
+    output_file = os.path.join(os.getcwd(), "test.png")
+    download_image("test", output_file)