Quantcast
Channel: Active questions tagged python - Stack Overflow
Viewing all articles
Browse latest Browse all 23276

Using Selenium to make Amazon Product Info Scraper

$
0
0

I am new to all of this, but I want to make an Amazon product detail scraper with Python.

I searched around and found some code. I also made some improvements (such as div and span names, etc.), and this is my latest code. But I have a problem.

one example of xlsx

import requests
from bs4 import BeautifulSoup
import re
import json
import pandas as pd
from faker import Faker
from tkinter import *
import time

# Shared accumulator: every scraped product dict is appended here and the
# whole list is re-exported to Excel after each product (checkpointing).
result = []


def get_soup_retry(url):
    """Fetch *url* and return a BeautifulSoup tree.

    Retries forever with a fresh fake User-Agent (and a back-off sleep)
    whenever Amazon serves a captcha page instead of the real content.
    """
    fake = Faker()
    uag_random = fake.user_agent()
    header = {
        'User-Agent': uag_random,
        'Accept-Language': 'en-US,en;q=0.9',
    }
    while True:
        page = requests.get(url, headers=header, timeout=5)
        soup = BeautifulSoup(page.content, 'lxml')
        if 'captcha' not in str(soup):
            print('Bot bypassed')
            return soup
        # Captcha detected: rotate identity, back off, then retry.
        uag_random = fake.user_agent()
        # FIX: the original rotated uag_random but never put the new value
        # back into `header`, so every retry reused the same User-Agent.
        header['User-Agent'] = uag_random
        print(f'\rBot has been detected... retrying ... use new identity: {uag_random} ', end='', flush=True)
        for i in [0.5, 1, 2, 3, 4, 5]:
            print("Waiting for %s seconds" % i)
            time.sleep(i)


def get_detail(url):
    """Scrape one product page, append its fields to the global ``result``
    list, and return that list."""
    soup = get_soup_retry(url)

    def grab(getter):
        # Evaluate a chain of find() calls; return '' if any link is missing.
        try:
            return getter().text.strip()
        except AttributeError:
            return ''

    # FIX: the original called `.find()` on the title span (which returns a
    # child tag or None) instead of extracting its text.
    title = grab(lambda: soup.find('span', attrs={'id': 'productTitle'}))
    discount_percent = grab(lambda: soup.find('div', attrs={'class': 'a-section a-spacing-none aok-align-center aok-relative'}).find('span', attrs={'class': 'a-size-large a-color-price savingPriceOverride aok-align-center reinventPriceSavingsPercentageMargin savingsPercentage'}))
    current_price = grab(lambda: soup.find('div', attrs={'id': 'tp_price_row_ww'}).find('span', attrs={'id': 'tp_price_block_total_price_ww'}).find("span", attrs={"class": "a-offscreen"}))
    review_count = grab(lambda: soup.find('span', attrs={'id': 'acrCustomerReviewText'}))
    available_stock = grab(lambda: soup.find('div', attrs={'id': 'availability'}).find('span'))
    try:
        asin = soup.find(id='averageCustomerReviews').get('data-asin')
    except AttributeError:
        # Fall back to parsing the ASIN out of the product URL itself.
        asin = url.split('/dp/')[1].replace('/', '')
    rating = grab(lambda: soup.find('span', attrs={'data-hook': 'rating-out-of-text'}))
    try:
        # First token of the Best Sellers Rank cell (FIX: `[-0]` is just [0]).
        xy = soup.select_one('th:-soup-contains("Best Sellers Rank") + td').text.split()[0]
    except AttributeError:
        xy = ''
    bestseller = grab(lambda: soup.find('div', attrs={'class': 'zg-badge-wrapper'}).find("a", attrs={"class": "badge-link"}).find("span", attrs={"class": "cat-name"}).find("span", attrs={"class": "cat-link"}))
    wholeprice = grab(lambda: soup.find('div', attrs={'class': 'a-spacing-top-mini'}).find("span", attrs={"class": "a-offscreen"}))
    choice = grab(lambda: soup.find('div', attrs={'class': 'ac-badge-wrapper'}).find("span", attrs={"class": "ac-badge-text-primary ac-white"}))
    fulfilled = grab(lambda: soup.find('div', attrs={'class': 'offer-display-feature-text a-spacing-none'}).find("span", attrs={"class": "a-size-small offer-display-feature-text-message"}))

    goal = {
        'asin': asin,
        'title': title,  # new column: the original scraped the title but never stored it
        'fiyat': current_price,
        'rating': rating,
        'review': review_count,
        'stok': available_stock,
        'indirim yuzdesi': discount_percent,
        'bsr': xy,
        'bestseller': bestseller,
        'Amazons choice': choice,
        'fulfilled by': fulfilled,
    }
    print(goal)
    result.append(goal)
    return result


def search_keyword(keyword):
    """Walk the Amazon search pages for *keyword*, scraping each product and
    checkpointing accumulated rows to ``result_amzn_USA_<keyword>.xlsx``."""
    count_page = 0
    count_asin = 0
    # FIX: the original ran keyword.replace('', '+') inside the product loop,
    # which inserts '+' between EVERY character ('ab' -> '+a+b+') and then
    # compounds on each iteration. Spaces are what must become '+', once.
    slug = str(keyword).replace(' ', '+')
    while True:
        count_page += 1
        url = f'https://www.amazon.com/s?k={keyword}&page={count_page}'
        print(f'Getting page: {count_page} | {url}')
        soup = get_soup_retry(url)
        try:
            # Renamed from `result` so it no longer shadows the global list.
            listing = soup.find('div', attrs={'class': 's-main-slot s-result-list s-search-results sg-row'}).find_all('div', attrs={'data-component-type': 's-search-result'})
        except AttributeError:
            continue
        for ids in listing:
            count_asin += 1
            asin = ids['data-asin']
            url_product = f'https://www.amazon.com/dp/{asin}'
            print(f'{count_asin}. {url_product}')
            list_result = get_detail(url_product)
            # Re-export everything scraped so far; a crash loses nothing.
            df = pd.DataFrame(list_result)
            df.to_excel(f'result_amzn_USA_{slug}.xlsx', index=False)
        # Stop when the "Next" control is disabled (last results page).
        if soup.find('li', {'class': 'a-disabled a-last'}):
            break

On the Amazon.com website, as you all know, you should enter a US postcode or select your country.

I want to select a US postcode (for example 10008) to see all products that ship to 10008.

ChatGPT recommended using Selenium and sent me a "revised" code, but it's not working. I mean, the outputs are not correct; the code itself runs fine.

Can you help me with selenium? Or can you give me advice?

Thanks in advance.

(I saw a thread here recommending "aiohttp", but I'm not sure about that. I want the scraper to cover both the USA and JP, and even SG — but can I find the right proxies for those countries? Also, what about Faker's user agents?)

Also, here's my last code to run the py files above.

import sys
import USAmain as amzn

# Simple console menu: read a keyword and a category slug, hand the combined
# search query to USAmain.search_keyword, and loop until the user quits.
while True:
    print('===========  ==========')
    print('=========== USA  ==========')
    print('| 1. Keyword Girin       |')
    print('| 2. Çıkış                      |')
    print('=================================')
    menu = input('Type 1/2: ')
    if menu == '1':
        kwd = input('Keyword Girin: ')
        print("Kategoriler:")
        print("Automotive=                    'automotive'")
        print("Home & Kitchen=                'garden'")
        print("Baby=                          'baby-products'")
        print("Beauty & Personal care=        'beauty'")
        print("Electronics=                   'electronics'")
        print("Health & Household=            'hpc'")
        print("Pet Supplies=                  'pets'")
        print("Industrial & Scientific=       'industrial'")
        print("Sports & Outdoors=             'sporting'")
        print("Tools & Home Improvement=      'tools'")
        print("Toys & Games=                  'toys-and-games'")
        print("Appliances =                   'appliances'")
        print("Cell Phones & Accessories=     'mobile'")
        print("Clothing, Shoes & Jewelry=     'fashion'")
        print("Office Products=               'office-products'")
        # FIX: the original used kwd.replace('', '+'), which inserts '+'
        # between every character. Only spaces must become '+' for the URL.
        kw = kwd.replace(' ', '+')
        category = input("Yukarıdaki Kategorilerden İstediğinizi Girin:")
        # Combined query string: "<keyword>&i=<category slug>".
        a = kw + "&i=" + category
        amzn.search_keyword(a)
        back = input('\nback to menu? y/n: ')
        if back == 'y':
            continue
        break
    else:
        print('see u')
        sys.exit()

I tried using Selenium with the help of ChatGPT, but it's not working as intended.

Chatgpt codes

import requests
from bs4 import BeautifulSoup
import re
import json
import pandas as pd
from faker import Faker
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import os

# Shared accumulator: every scraped product dict is appended here.
result = []


def get_soup_retry(url):
    """Fetch *url* with requests and return a BeautifulSoup tree, retrying
    with a fresh fake User-Agent whenever Amazon serves a captcha page."""
    fake = Faker()
    uag_random = fake.user_agent()
    header = {
        'User-Agent': uag_random,
        'Accept-Language': 'en-US,en;q=0.9',
    }
    while True:
        page = requests.get(url, headers=header, timeout=5)
        soup = BeautifulSoup(page.content, 'lxml')
        if 'captcha' not in str(soup):
            print('Bot bypassed')
            return soup
        uag_random = fake.user_agent()
        # FIX: the original rotated uag_random but never wrote it back into
        # `header`, so every retry went out with the same User-Agent.
        header['User-Agent'] = uag_random
        print(f'\rBot has been detected... retrying ... use new identity: {uag_random} ', end='', flush=True)
        for i in [0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5]:
            print("Waiting for %s seconds" % i)
            time.sleep(i)


def get_detail(url):
    """Scrape one product page, append its fields to the global ``result``
    list, and return that list.

    NOTE(review): this fetches with plain requests, which does NOT share the
    Selenium browser's delivery-location cookies — so the postcode set in
    set_delivery_location() has no effect on these pages. Confirm whether
    that is acceptable, or transfer driver.get_cookies() into a Session.
    """
    soup = get_soup_retry(url)

    def grab(getter):
        # Evaluate a chain of find() calls; return '' if any link is missing.
        try:
            return getter().text.strip()
        except AttributeError:
            return ''

    # FIX: the original called `.find()` on the title span instead of
    # extracting its text.
    title = grab(lambda: soup.find('span', attrs={'id': 'productTitle'}))
    discount_percent = grab(lambda: soup.find('div', attrs={'class': 'a-section a-spacing-none aok-align-center aok-relative'}).find('span', attrs={'class': 'a-size-large a-color-price savingPriceOverride aok-align-center reinventPriceSavingsPercentageMargin savingsPercentage'}))
    current_price = grab(lambda: soup.find('div', attrs={'id': 'tp_price_row_ww'}).find('span', attrs={'id': 'tp_price_block_total_price_ww'}).find("span", attrs={"class": "a-offscreen"}))
    review_count = grab(lambda: soup.find('span', attrs={'id': 'acrCustomerReviewText'}))
    available_stock = grab(lambda: soup.find('div', attrs={'id': 'availability'}).find('span'))
    try:
        asin = soup.find(id='averageCustomerReviews').get('data-asin')
    except AttributeError:
        # Fall back to parsing the ASIN out of the product URL itself.
        asin = url.split('/dp/')[1].replace('/', '')
    rating = grab(lambda: soup.find('span', attrs={'data-hook': 'rating-out-of-text'}))
    try:
        # First token of the Best Sellers Rank cell (FIX: `[-0]` is just [0]).
        xy = soup.select_one('th:-soup-contains("Best Sellers Rank") + td').text.split()[0]
    except AttributeError:
        xy = ''
    bestseller = grab(lambda: soup.find('div', attrs={'class': 'zg-badge-wrapper'}).find("a", attrs={"class": "badge-link"}).find("span", attrs={"class": "cat-name"}).find("span", attrs={"class": "cat-link"}))
    wholeprice = grab(lambda: soup.find('div', attrs={'class': 'a-spacing-top-mini'}).find("span", attrs={"class": "a-offscreen"}))
    choice = grab(lambda: soup.find('div', attrs={'class': 'ac-badge-wrapper'}).find("span", attrs={"class": "ac-badge-text-primary ac-white"}))
    fulfilled = grab(lambda: soup.find('div', attrs={'class': 'offer-display-feature-text a-spacing-none'}).find("span", attrs={"class": "a-size-small offer-display-feature-text-message"}))

    goal = {
        'asin': asin,
        'title': title,  # new column: the original scraped the title but never stored it
        'fiyat': current_price,
        'rating': rating,
        'review': review_count,
        'stok': available_stock,
        'indirim yuzdesi': discount_percent,
        'bsr': xy,
        'bestseller': bestseller,
        'Amazons choice': choice,
        'fulfilled by': fulfilled,
    }
    print(goal)
    result.append(goal)
    return result


def set_delivery_location(driver, postal_code='10008'):
    """Open amazon.com in *driver* and set the delivery ZIP to *postal_code*
    via the header location widget, waiting for the header to confirm it."""
    driver.get('https://www.amazon.com')
    wait = WebDriverWait(driver, 10)
    try:
        location_button = wait.until(EC.element_to_be_clickable((By.ID, 'nav-global-location-slot')))
        location_button.click()
        postal_code_input = wait.until(EC.element_to_be_clickable((By.ID, 'GLUXZipUpdateInput')))
        postal_code_input.send_keys(postal_code)
        apply_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//span[@id="GLUXZipUpdate"]/span/input')))
        apply_button.click()
        # The "glow" line in the nav bar shows the ZIP once it has applied.
        wait.until(EC.text_to_be_present_in_element((By.ID, 'glow-ingress-line2'), postal_code))
    except Exception as e:
        print(f"Error setting delivery location: {e}")


def search_keyword(keyword, postal_code='10008'):
    """Walk the Amazon search pages for *keyword* in a headless Chrome with
    the delivery location set, scraping each product to an .xlsx file."""
    count_page = 0
    count_asin = 0
    chromedriver_path = r'C:\path\to\chromedriver.exe'  # Update this path with the correct path to your chromedriver executable
    service = Service(chromedriver_path)
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Run headless Chrome
    driver = webdriver.Chrome(service=service, options=options)
    # Set delivery location before browsing the result pages.
    set_delivery_location(driver, postal_code)
    # FIX: the original ran keyword.replace('', '+') inside the product loop,
    # which inserts '+' between every character and compounds per iteration.
    slug = str(keyword).replace(' ', '+')
    while True:
        count_page += 1
        url = f'https://www.amazon.com/s?k={keyword}&page={count_page}'
        print(f'Getting page: {count_page} | {url}')
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'lxml')
        try:
            results = soup.find('div', attrs={'class': 's-main-slot s-result-list s-search-results sg-row'}).find_all('div', attrs={'data-component-type': 's-search-result'})
        except AttributeError:
            continue
        for ids in results:
            count_asin += 1
            asin = ids['data-asin']
            url_product = f'https://www.amazon.com/dp/{asin}'
            print(f'{count_asin}. {url_product}')
            list_result = get_detail(url_product)
            # Re-export everything scraped so far; a crash loses nothing.
            df = pd.DataFrame(list_result)
            df.to_excel(f'result_amzn_USA_{slug}.xlsx', index=False)
        # Stop when the "Next" control is disabled (last results page).
        last_page = soup.find('li', {'class': 'a-disabled a-last'})
        if last_page:
            break
    driver.quit()

Update:

Changed the code to this

    # NOTE(review): truncated excerpt (the author elided the rest with "..."),
    # formatting destroyed by extraction — kept verbatim; see the full version below.
    import requestsfrom bs4 import BeautifulSoupimport reimport jsonimport pandas as pdfrom faker import Fakerfrom tkinter import *import timeimport randomresult = []# Proxy listesiproxies = ["50.168.210.232","50.207.199.84","50.218.57.65",]def get_soup_retry(url):    fake = Faker()    uag_random = fake.user_agent()    header = {'User-Agent': uag_random,'Accept-Language': 'en-US,en;q=0.9'    }    isCaptcha = True    while isCaptcha:        proxy = random.choice(proxies)        proxy_dict = {"http": proxy,"https": proxy,        }        try:            page = requests.get(url, headers=header, proxies=proxy_dict, timeout=5)            page.raise_for_status()              soup = BeautifulSoup(page.content, 'lxml')            if 'captcha' in str(soup):                uag_random = fake.user_agent()                print(f'\rBot has been detected... retrying ... use new identity: {uag_random} and proxy: {proxy}', end='', flush=True)                for i in [0.5, 0.1, 1, 2, 3, 4, 5]:                    print(f"Waiting for {i} seconds")                    time.sleep(i)                continue            else:                print('Bot bypassed')                return soup        except (requests.exceptions.RequestException, requests.exceptions.ProxyError) as e:            print(f"Request failed: {e}")            continue  # Başka proxy denessssdef get_detail(url):    soup = get_soup_retry(url)...

With the proxies I got from there, but it's not working.

Request failed: HTTPSConnectionPool(host='www.amazon.com', port=443): Max retries exceeded with url: /s?k=leashes+for+dogs&i=pets&page=1 (Caused by ProxyError('Unable to connect to proxy', ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x00000224CF5ECDC0>, 'Connection to 50.207.199.84 timed out. (connect timeout=5)')))

LAST UPDATE

This seems to be working right now. Thanks.

import requests
from bs4 import BeautifulSoup
import re
import json
import pandas as pd
from faker import Faker
from tkinter import *
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Shared accumulator: every scraped product dict is appended here.
result = []

# FIX: a shared Session so the delivery-location cookies captured from the
# Selenium browser actually apply to the requests-based scraping. In the
# original, set_delivery_location() quit the browser without exporting its
# cookies, so the postcode never affected the scraped pages.
session = requests.Session()


def get_soup_retry(url):
    """Fetch *url* through the shared session and return a BeautifulSoup
    tree, retrying with a fresh fake User-Agent on captcha pages."""
    fake = Faker()
    uag_random = fake.user_agent()
    header = {
        'User-Agent': uag_random,
        'Accept-Language': 'en-US,en;q=0.9',
    }
    while True:
        page = session.get(url, headers=header, timeout=5)
        soup = BeautifulSoup(page.content, 'lxml')
        if 'captcha' not in str(soup):
            print('Bot bypassed')
            return soup
        uag_random = fake.user_agent()
        # FIX: the original rotated uag_random but never wrote it back into
        # `header`, so every retry went out with the same User-Agent.
        header['User-Agent'] = uag_random
        print(f'\rBot has been detected... retrying ... use new identity: {uag_random} ', end='', flush=True)
        for i in [0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5]:
            print("Waiting for %s seconds" % i)
            time.sleep(i)


def get_detail(url):
    """Scrape one product page, append its fields to the global ``result``
    list, and return that list."""
    soup = get_soup_retry(url)

    def grab(getter):
        # Evaluate a chain of find() calls; return '' if any link is missing.
        try:
            return getter().text.strip()
        except AttributeError:
            return ''

    title = grab(lambda: soup.find('span', attrs={'id': 'productTitle'}))
    discount_percent = grab(lambda: soup.find('div', attrs={'class': 'a-section a-spacing-none aok-align-center aok-relative'}).find('span', attrs={'class': 'a-size-large a-color-price savingPriceOverride aok-align-center reinventPriceSavingsPercentageMargin savingsPercentage'}))
    current_price = grab(lambda: soup.find('div', attrs={'id': 'tp_price_row_ww'}).find('span', attrs={'id': 'tp_price_block_total_price_ww'}).find("span", attrs={"class": "a-offscreen"}))
    review_count = grab(lambda: soup.find('span', attrs={'id': 'acrCustomerReviewText'}))
    available_stock = grab(lambda: soup.find('div', attrs={'id': 'availability'}).find('span'))
    try:
        asin = soup.find(id='averageCustomerReviews').get('data-asin')
    except AttributeError:
        # Fall back to parsing the ASIN out of the product URL itself.
        asin = url.split('/dp/')[1].replace('/', '')
    rating = grab(lambda: soup.find('span', attrs={'data-hook': 'rating-out-of-text'}))
    try:
        # First token of the Best Sellers Rank cell.
        xy = soup.select_one('th:-soup-contains("Best Sellers Rank") + td').text.split()[0]
    except AttributeError:
        xy = ''
    bestseller = grab(lambda: soup.find('div', attrs={'class': 'zg-badge-wrapper'}).find("a", attrs={"class": "badge-link"}).find("span", attrs={"class": "cat-name"}).find("span", attrs={"class": "cat-link"}))
    wholeprice = grab(lambda: soup.find('div', attrs={'class': 'a-spacing-top-mini'}).find("span", attrs={"class": "a-offscreen"}))
    choice = grab(lambda: soup.find('div', attrs={'class': 'ac-badge-wrapper'}).find("span", attrs={"class": "ac-badge-text-primary ac-white"}))
    fulfilled = grab(lambda: soup.find('div', attrs={'class': 'offer-display-feature-text a-spacing-none'}).find("span", attrs={"class": "a-size-small offer-display-feature-text-message"}))

    goal = {
        'asin': asin,
        'title': title,
        'price': current_price,
        'rating': rating,
        'reviews': review_count,
        'stock': available_stock,
        'discount_percentage': discount_percent,
        'best_sellers_rank': xy,
        'bestseller': bestseller,
        'whole_price': wholeprice,
        'amazon_choice': choice,
        'fulfilled_by': fulfilled,
    }
    print(goal)
    result.append(goal)
    return result


def set_delivery_location(postal_code):
    """Set the Amazon delivery ZIP in a headless Chrome and copy the
    resulting cookies into the shared requests session."""
    chromedriver_path = r'C:\path\to\chromedriver.exe'  # TODO: point at your chromedriver executable
    service = Service(chromedriver_path)
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Run headless Chrome
    driver = webdriver.Chrome(service=service, options=options)
    driver.get('https://www.amazon.com')
    wait = WebDriverWait(driver, 10)
    try:
        location_button = wait.until(EC.element_to_be_clickable((By.ID, 'nav-global-location-slot')))
        location_button.click()
        postal_code_input = wait.until(EC.element_to_be_clickable((By.ID, 'GLUXZipUpdateInput')))
        postal_code_input.send_keys(postal_code)
        apply_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//span[@id="GLUXZipUpdate"]/span/input')))
        apply_button.click()
        # The nav-bar "glow" line shows the ZIP once it has applied.
        wait.until(EC.text_to_be_present_in_element((By.ID, 'glow-ingress-line2'), postal_code))
        # FIX: export the browser's cookies so the requests-based scraping
        # below actually sees the chosen delivery location.
        for cookie in driver.get_cookies():
            session.cookies.set(cookie['name'], cookie['value'], domain=cookie.get('domain'))
    except Exception as e:
        print(f"Error setting delivery location: {e}")
    finally:
        driver.quit()


def search_keyword(keyword, postal_code='10008'):
    """Walk the Amazon search pages for *keyword* with the delivery location
    set to *postal_code*, scraping each product to an .xlsx file."""
    set_delivery_location(postal_code)
    count_page = 0
    count_asin = 0
    # FIX: the original ran keyword.replace('', '+') inside the product loop,
    # which inserts '+' between every character and compounds per iteration.
    slug = str(keyword).replace(' ', '+')
    while True:
        count_page += 1
        url = f'https://www.amazon.com/s?k={keyword}&page={count_page}'
        print(f'Getting page: {count_page} | {url}')
        soup = get_soup_retry(url)
        try:
            # Renamed from `result` so it no longer shadows the global list.
            listing = soup.find('div', attrs={'class': 's-main-slot s-result-list s-search-results sg-row'}).find_all('div', attrs={'data-component-type': 's-search-result'})
        except AttributeError:
            continue
        for ids in listing:
            count_asin += 1
            asin = ids['data-asin']
            url_product = f'https://www.amazon.com/dp/{asin}'
            print(f'{count_asin}. {url_product}')
            list_result = get_detail(url_product)
            # Re-export everything scraped so far; a crash loses nothing.
            df = pd.DataFrame(list_result)
            df.to_excel(f'result_amzn_USA_{slug}.xlsx', index=False)
        # Stop when the "Next" control is disabled (last results page).
        last_page = soup.find('li', {'class': 'a-disabled a-last'})
        if last_page:
            break

Viewing all articles
Browse latest Browse all 23276

Trending Articles



<script src="https://jsc.adskeeper.com/r/s/rssing.com.1596347.js" async> </script>