Files
FABLE/utils/wiki_downloader.py
Paillat 6f9ad7f025 feat(wiki_downloader.py): add script to download images from Wikipedia
This commit adds a new script called `wiki_downloader.py` which allows downloading images from Wikipedia. The script uses Selenium and undetected_chromedriver to automate the process of searching for an image on Google and downloading it.

The `download_image` function takes two parameters: `query` and `download_path`. It opens a Chrome browser using undetected_chromedriver and navigates to the Google Images search page with the specified query. After a short delay, it clicks on the "tout refuser" ("reject all") button to dismiss the cookie-consent prompt. Then, it finds the first image in the search results and extracts its source URL. The image is then obtained either by decoding a base64-encoded data URI or by making a request to the image URL. The downloaded image is saved at the specified `download_path`.

The script also includes a test case that demonstrates how to use the `download_image` function.
2023-07-01 21:46:19 +02:00

58 lines
1.8 KiB
Python

import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
import requests
import os
import time
import base64
from PIL import Image
from io import BytesIO
def download_image(query, download_path):
    """Save the first Google Images result for ``query`` from wikipedia.org.

    A Chrome session is started through undetected_chromedriver and a Google
    Images search restricted to ``site:wikipedia.org`` is opened. If a button
    labelled "tout refuser" is present it is clicked to dismiss the consent
    dialog, then the first ``rg_i`` thumbnail is clicked. The ``src`` of the
    ``r48jcc`` element (presumably the enlarged preview -- confirm against the
    current Google markup) is decoded when it is a ``data:`` URI or fetched
    over HTTP otherwise, opened with PIL and written to ``download_path``.

    Args:
        query: Search terms; URL-encoded before being placed in the search URL.
        download_path: Full output path, including the output file name.

    Returns:
        None. Any failure after the browser has started is printed to stdout
        and not re-raised (best-effort behaviour). The browser is always
        closed before returning.
    """
    # Local import so this function stays self-contained.
    from urllib.parse import quote_plus

    options = uc.ChromeOptions()
    options.add_argument('--no-sandbox')
    driver = uc.Chrome(options=options)
    try:
        # quote_plus turns spaces into '+' (as before) and additionally
        # escapes characters such as '&' or '#' that would corrupt the URL.
        driver.get(
            "https://www.google.com/search?site=&tbm=isch&source=hp"
            "&biw=1873&bih=990&tbs=isz:l"
            f"&q=site:wikipedia.org+{quote_plus(query)}"
        )
        time.sleep(2)  # fixed pause before looking for the consent dialog

        # Dismiss the consent dialog; the label match targets the French UI.
        for button in driver.find_elements(By.CLASS_NAME, "VfPpkd-vQzf8d"):
            if button.text.lower() == "tout refuser":
                button.click()
                break
        time.sleep(10)

        # Click the first thumbnail, then give the page time to update.
        driver.find_element(By.CLASS_NAME, "rg_i").click()
        time.sleep(2)

        src = driver.find_element(By.CLASS_NAME, "r48jcc").get_attribute("src")
        if not src:
            # Fail with a clear message instead of an obscure requests error.
            raise ValueError("the selected image has no 'src' attribute")

        if src.startswith("data:"):
            # data:<mime>;base64,<payload> -- only the payload is decoded.
            image_content = base64.b64decode(src.split(",", 1)[1])
        else:
            # A timeout keeps a stalled server from hanging the call forever.
            response = requests.get(src, timeout=30)
            response.raise_for_status()
            image_content = response.content

        # Palette and alpha images are flattened to RGB before saving.
        img = Image.open(BytesIO(image_content))
        if img.mode in ("RGBA", "P"):
            img = img.convert("RGB")
        img.save(download_path)  # download_path already contains the output filename
        print('Image downloaded successfully at ', download_path)
    except Exception as e:
        # Deliberate best-effort boundary: report the failure and return.
        print(f"An error occurred: {e}")
    finally:
        # Single cleanup point for both the success and the failure path.
        driver.quit()
if __name__ == "__main__":
    # Manual smoke test: fetch an image for the query "test" and store it
    # as test.png in the current working directory.
    output_file = os.path.join(os.getcwd(), "test.png")
    download_image("test", output_file)