TransWikia.com

Image scraping from google using Python

Stack Overflow Asked by ss_0708 on December 25, 2021

I am trying to scrape the first 3 images from Google for each search string in a list. As I am not very well versed in Python, I took help from a few sources to write the code below. I need the images to be saved in separate folders (each named after its search string) in the current working directory. The code does create a folder, but only for the first search string, and that folder is empty.

It exits with the following error:

NameError: name 'time' is not defined

The code is :-

# The ``wd`` annotation is quoted so that defining this function does not
# require the name ``webdriver`` to be bound yet (annotations are otherwise
# evaluated when the ``def`` statement runs).
def fetch_image_urls(query:str, max_links_to_fetch:int, wd:"webdriver", sleep_between_interactions:int=3):
    """Collect thumbnail image URLs from a Google Images search page.

    Args:
        query: Search string. It is URL-encoded before being inserted into
            the search URL, so spaces and characters such as ``&`` are safe.
        max_links_to_fetch: Upper bound on the number of URLs returned.
        wd: Driver object. Only ``get`` and ``find_elements_by_css_selector``
            are called on it.
        sleep_between_interactions: Seconds to wait after loading the page
            and between successive scans of the thumbnail grid.

    Returns:
        A set holding at most ``max_links_to_fetch`` non-empty ``src``
        attribute values of the matched thumbnails.
    """
    encoded_query = quote_plus(query)
    search_url = f"https://www.google.com/search?q={encoded_query}&tbm=isch&ved=2ahUKEwjdh7KtreXqAhX8zIsBHbGHDGkQ2-cCegQIABAA&oq={encoded_query}&gs_lcp=CgNpbWcQA1CntRBYp7UQYNe7EGgAcAB4AIABywGIAcsBkgEDMi0xmAEAoAEBqgELZ3dzLXdpei1pbWfAAQE&sclient=img&ei=MYwaX52rF_yZr7wPsY-yyAY&bih=578&biw=1280"
    wd.get(search_url)
    time.sleep(sleep_between_interactions)

    image_urls = set()
    # Re-scan the page a bounded number of times: thumbnails may still be
    # loading, so a later pass can see ``src`` values an earlier one missed.
    for _ in range(1, 20):
        # NOTE(review): newer Selenium releases appear to have dropped the
        # ``find_elements_by_*`` helpers in favour of
        # ``find_elements("css selector", ...)`` - confirm against the
        # installed version before upgrading.
        thumbnails = wd.find_elements_by_css_selector("img.Q4LuWd")
        for img in thumbnails:
            if len(image_urls) >= max_links_to_fetch:
                break
            src = img.get_attribute('src')
            # Elements without a src yield None; those cannot be downloaded.
            if src:
                image_urls.add(src)
        print(f"Found: {len(image_urls)} search results. Extracting links...")
        if len(image_urls) >= max_links_to_fetch:
            break
        time.sleep(sleep_between_interactions)
    return image_urls

def persist_image(folder_path:str,url:str):
    """Download one image and store it as a JPEG inside ``folder_path``.

    The file name is the first 10 hex characters of the SHA-1 digest of the
    downloaded bytes plus ``.jpg``. Failures are reported on stdout and never
    raised, so one bad URL does not abort a batch.

    Args:
        folder_path: Existing directory that receives the image file.
        url: Address of the image to download.

    Returns:
        None.
    """
    # The caller may hand over None/"" for thumbnails that had no src.
    if not url:
        print(f"ERROR - Could not download {url} - empty URL")
        return

    try:
        headers = {'User-agent': 'Chrome/84.0.4147.89'}
        # A timeout keeps one stalled server from hanging the whole run.
        response = requests.get(url, headers=headers, timeout=30)
        # Do not try to decode an HTTP error page as an image.
        response.raise_for_status()
        image_content = response.content
    except Exception as e:
        print(f"ERROR - Could not download {url} - {e}")
        # Without the bytes there is nothing to save; stopping here avoids a
        # second, misleading "Could not save" error for an unbound variable.
        return

    try:
        image_file = io.BytesIO(image_content)
        image = Image.open(image_file).convert('RGB')
        file_path = os.path.join(folder_path,hashlib.sha1(image_content).hexdigest()[:10] + '.jpg')
        with open(file_path, 'wb') as f:
            image.save(f, "JPEG", quality=85)
        print(f"SUCCESS - saved {url} - as {file_path}")
    except Exception as e:
        print(f"ERROR - Could not save {url} - {e}")

def search_and_download(search_term:str,driver_path:str,target_path='./images',number_images=5):
    """Search for ``search_term`` and save the images found to disk.

    Images are written to a sub-folder of ``target_path`` named after the
    lower-cased search term with spaces replaced by underscores.

    Args:
        search_term: Text to search for.
        driver_path: Path passed to ``webdriver.Chrome`` as
            ``executable_path``.
        target_path: Parent directory for the per-term folders.
        number_images: Passed to ``fetch_image_urls`` as the number of links
            to fetch.
    """
    target_folder = os.path.join(target_path,'_'.join(search_term.lower().split(' ')))
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(target_folder, exist_ok=True)

    with webdriver.Chrome(executable_path=driver_path) as wd:
        res = fetch_image_urls(search_term, number_images, wd=wd, sleep_between_interactions=3)

    # The URLs are plain strings, so the browser can be released before the
    # (potentially slow) downloads start.
    for elem in res:
        persist_image(target_folder,elem)

import hashlib
import io
import os
import time
from urllib.parse import quote_plus

import requests
from PIL import Image
from selenium import webdriver

search_terms = ['1415 Bush St', '2015 Washington Blvd', '1420 Joh Ave', '901 W Ostend St']

# NOTE(review): DRIVER_PATH is not defined in the visible snippet; it is
# passed on as the chromedriver executable path - confirm it is set earlier.
for search_term in search_terms:
    # The stated goal is the first 3 images per search string, so ask for 3
    # explicitly instead of relying on the function's default of 5.
    search_and_download(search_term=search_term, driver_path=DRIVER_PATH, number_images=3)

Please suggest corrections or edits that would produce the desired output.

Thanks !!

2 Answers

I have resolved the issue with a workaround. The code below works as required. I am posting it to help anyone with a similar requirement in the future.

Thanks all for any help !!

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import os
import urllib.parse
import urllib.request

# Script: for every search term, open Google Images, then download each
# thumbnail found on the result page into a folder named after the term.

searchterm = ['3500 Boston St','1415 Bush St','1811 POrtal St']

# NOTE(review): placeholder path - the backslashes appear to have been lost
# when the page was rendered; point this at your local chromedriver.exe.
# One browser is started and reused for every term instead of one per term.
browser = webdriver.Chrome(r'C:\Users\XXXXxxxx\Documents\chromedriver.exe')
try:
    for i in searchterm:
        # Encode the term so spaces and reserved characters survive in the URL.
        url = "https://www.google.co.in/search?q="+urllib.parse.quote_plus(i)+"&source=lnms&tbm=isch"
        browser.get(url)

        counter = 0     # thumbnails seen for this term
        succounter = 0  # thumbnails saved for this term
        os.makedirs(i, exist_ok=True)

        for x in browser.find_elements_by_xpath('//img[contains(@class,"rg_i Q4LuWd")]'):
            counter = counter + 1
            print("Total Count:", counter)
            print("Succsessful Count:", succounter)
            print("URL:", x.get_attribute('src'))

            img = x.get_attribute('src')
            new_filename = i+" " +str(counter)+".jpg"

            try:
                # Save as "<term>/<term> <n>.jpg"; the file name carries the
                # term exactly once.
                path = os.path.join(i, new_filename)
                urllib.request.urlretrieve(img, path)
                succounter += 1
            except Exception as e:
                # Best effort: report the failure and move on to the next image.
                print(e)

        # Reported per term, since the counters are reset for every term.
        print(succounter, "pictures succesfully downloaded")
finally:
    # Always end the driver session, even if a search fails midway.
    browser.quit()

Answered by ss_0708 on December 25, 2021

import time

You need to import time for it to work.

Answered by AaronS on December 25, 2021

Add your own answers!

Ask a Question

Get help from others!

© 2024 TransWikia.com. All rights reserved. Sites we Love: PCI Database, UKBizDB, Menu Kuliner, Sharing RPP