I am new to all of this, but I want to make an Amazon product detail scraper with Python.
I searched around and found some code. I also made some improvements (such as the div and span names, etc.) and this is my latest code. But I have a problem.
import requestsfrom bs4 import BeautifulSoupimport reimport jsonimport pandas as pdfrom faker import Fakerfrom tkinter import *import time result = []def get_soup_retry(url): fake = Faker() uag_random = fake.user_agent() header = {'User-Agent': uag_random,'Accept-Language': 'en-US,en;q=0.9' } isCaptcha = True while isCaptcha: page = requests.get(url, headers=header, timeout=5) soup = BeautifulSoup(page.content, 'lxml') if 'captcha' in str(soup): uag_random = fake.user_agent() print(f'\rBot has been detected... retrying ... use new identity: {uag_random} ', end='', flush=True) for i in [ .5, .1, 1, 2, 3, 4, 5 ]: print("Waiting for %s" % i , end='') print(" seconds") time.sleep(i) continue else: print('Bot bypassed') return soupdef get_detail(url): soup = get_soup_retry(url) b = soup.find("td", attr= {"class": "a-color-secondary a-size-base prodDetSectionEntry "}) try: title = soup.find('span', attrs={'id': 'productTitle'}).find() # to get the text, and strip is used to remove all the leading and trailing spaces from a string. 
except AttributeError: title = '' try: discount_percent = soup.find('div', attrs={'class': 'a-section a-spacing-none aok-align-center aok-relative'}).find('span', attrs={'class': 'a-size-large a-color-price savingPriceOverride aok-align-center reinventPriceSavingsPercentageMargin savingsPercentage'}).string.strip() except AttributeError: discount_percent = '' try: current_price = soup.find('div', attrs={'id': 'tp_price_row_ww'}).find('span', attrs={'id': 'tp_price_block_total_price_ww'}).find ("span", attrs={"class": "a-offscreen"}).string.strip() except AttributeError: current_price = '' try: review_count = soup.find('span', attrs={'id': 'acrCustomerReviewText'}).text.strip() except AttributeError: review_count = '' try: available_stock = soup.find('div', attrs={'id': 'availability'}).find('span').text.strip() except AttributeError: available_stock = '' try: asin = soup.find(id='averageCustomerReviews').get('data-asin') except AttributeError: asin = url.split('/dp/')[1].replace('/', '') try: rating = soup.find('span', attrs={'data-hook': 'rating-out-of-text'}).text.strip() except AttributeError: rating = '' try: xy = soup.select_one('th:-soup-contains("Best Sellers Rank") + td').text.split()[-0] except AttributeError: xy = '' try: bestseller = soup.find('div', attrs={'class': 'zg-badge-wrapper'}).find("a",attrs={"class": "badge-link"}).find("span",attrs={"class": "cat-name"}).find("span",attrs={"class": "cat-link"}).text.strip() except AttributeError: bestseller = '' try: wholeprice = soup.find('div', attrs={'class': 'a-spacing-top-mini'}).find("span", attrs= {"class": "a-offscreen"}).text.strip() except AttributeError: wholeprice = '' try: choice = soup.find('div', attrs={'class': 'ac-badge-wrapper'}).find("span", attrs= {"class": "ac-badge-text-primary ac-white"}).text.strip() except AttributeError: choice = '' try: fulfilled = soup.find('div', attrs={'class': 'offer-display-feature-text a-spacing-none'}).find("span", attrs= {"class": "a-size-small 
offer-display-feature-text-message"}).text.strip() except AttributeError: fulfilled = '' goal = {'asin': asin,'fiyat': current_price,'rating': rating,'review': review_count,'stok': available_stock,'indirim yuzdesi': discount_percent,'bsr': xy,'bestseller':bestseller,'Amazons choice': choice, 'fulfilled by' : fulfilled } print(goal) result.append(goal) return resultdef search_keyword(keyword): count_page = 0 count_asin = 0 while True: count_page += 1 url = f'https://www.amazon.com/s?k={keyword}&page={count_page}' print(f'Getting page: {count_page} | {url}') soup = get_soup_retry(url) try: result = soup.find('div', attrs={'class': 's-main-slot s-result-list s-search-results sg-row'}).find_all('div', attrs={'data-component-type': 's-search-result'}) except AttributeError: continue for ids in result: count_asin += 1 asin = ids['data-asin'] url_product = f'https://www.amazon.com/dp/{asin}' print(f'{count_asin}. {url_product}') list_result = get_detail(url_product) df = pd.DataFrame(list_result) keyword = str(keyword).replace('', '+') # df.to_csv(f'result_amzn_{keyword}.csv', index=False) df.to_excel(f'result_amzn_USA_{keyword}.xlsx', index=False) # if count_asin % 50 == 0: # sleep = random.randint(3, 9) # print(f'calm down {sleep} seconds') # time.sleep(sleep) # bestseller = soup.find('div', attrs={'class': 'zg-badge-wrapper'}).find("a",attrs={"class": "badge-link"}).find("span",attrs={"class": "cat-name"}).find("span",attrs={"class": "cat-link"}).text.strip() last_page = soup.find('li', {'class': 'a-disabled a-last'}) if not last_page: pass else: breakIn the Amazon.com website, as u all know, you should enter a US postcode or enter your "Country"
I want to select the US postcode (for example 10008) to see all products that ship to 10008.
ChatGPT recommended using Selenium and sent me a "revised" version of the code, but it's not working. I mean, the outputs are not correct; the code itself runs fine.
Can you help me with selenium? Or can you give me advice?
Thanks in advance.
(I saw a thread here recommending "aiohttp", but I'm not sure about that. I want the scraper to work for both the USA and JP, and maybe even SG, but can I find the right proxies for those countries? Also, what about Faker's user agents?)
Also, here's my last code to run the py files above.
import sysimport USAmain as amznwhile True: print('=========== ==========') print('=========== USA ==========') print('| 1. Keyword Girin |') print('| 2. Çıkış |') print('=================================') menu = input('Type 1/2: ') if menu == '1': kwd = input('Keyword Girin: ') print ("Kategoriler:") print ("Automotive= 'automotive'") print ("Home & Kitchen= 'garden'") print ("Baby= 'baby-products'") print ("Beauty & Personal care= 'beauty'") print ("Electronics= 'electronics'") print ("Health & Household= 'hpc'") print ("Pet Supplies= 'pets'") print ("Industrial & Scientific= 'industrial'") print ("Sports & Outdoors= 'sporting'") print ("Tools & Home Improvement= 'tools'") print ("Toys & Games= 'toys-and-games'") print ("Appliances = 'appliances'") print ("Cell Phones & Accessories= 'mobile'") print ("Clothing, Shoes & Jewelry= 'fashion'") print ("Office Products= 'office-products'") kw = kwd.replace('', '+') category = input("Yukarıdaki Kategorilerden İstediğinizi Girin:" ) a = kw + ("&i=") + category amzn.search_keyword(a) back = input('\nback to menu? y/n: ') if back == 'y': continue else: break else: print('see u') sys.exit()I tried using Selenium with the help of "chatgpt" but its not working as intended.
ChatGPT's code:
import requestsfrom bs4 import BeautifulSoupimport reimport jsonimport pandas as pdfrom faker import Fakerfrom selenium import webdriverfrom selenium.webdriver.chrome.service import Servicefrom selenium.webdriver.common.by import Byfrom selenium.webdriver.common.keys import Keysfrom selenium.webdriver.support.ui import WebDriverWaitfrom selenium.webdriver.support import expected_conditions as ECimport timeimport osresult = []def get_soup_retry(url): fake = Faker() uag_random = fake.user_agent() header = {'User-Agent': uag_random,'Accept-Language': 'en-US,en;q=0.9' } isCaptcha = True while isCaptcha: page = requests.get(url, headers=header, timeout=5) soup = BeautifulSoup(page.content, 'lxml') if 'captcha' in str(soup): uag_random = fake.user_agent() print(f'\rBot has been detected... retrying ... use new identity: {uag_random} ', end='', flush=True) for i in [0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5]: print("Waiting for %s seconds" % i) time.sleep(i) continue else: print('Bot bypassed') return soupdef get_detail(url): soup = get_soup_retry(url) b = soup.find("td", attr= {"class": "a-color-secondary a-size-base prodDetSectionEntry "}) try: title = soup.find('span', attrs={'id': 'productTitle'}).find() # to get the text, and strip is used to remove all the leading and trailing spaces from a string. 
except AttributeError: title = '' try: discount_percent = soup.find('div', attrs={'class': 'a-section a-spacing-none aok-align-center aok-relative'}).find('span', attrs={'class': 'a-size-large a-color-price savingPriceOverride aok-align-center reinventPriceSavingsPercentageMargin savingsPercentage'}).string.strip() except AttributeError: discount_percent = '' try: current_price = soup.find('div', attrs={'id': 'tp_price_row_ww'}).find('span', attrs={'id': 'tp_price_block_total_price_ww'}).find ("span", attrs={"class": "a-offscreen"}).string.strip() except AttributeError: current_price = '' try: review_count = soup.find('span', attrs={'id': 'acrCustomerReviewText'}).text.strip() except AttributeError: review_count = '' try: available_stock = soup.find('div', attrs={'id': 'availability'}).find('span').text.strip() except AttributeError: available_stock = '' try: asin = soup.find(id='averageCustomerReviews').get('data-asin') except AttributeError: asin = url.split('/dp/')[1].replace('/', '') try: rating = soup.find('span', attrs={'data-hook': 'rating-out-of-text'}).text.strip() except AttributeError: rating = '' try: xy = soup.select_one('th:-soup-contains("Best Sellers Rank") + td').text.split()[-0] except AttributeError: xy = '' try: bestseller = soup.find('div', attrs={'class': 'zg-badge-wrapper'}).find("a",attrs={"class": "badge-link"}).find("span",attrs={"class": "cat-name"}).find("span",attrs={"class": "cat-link"}).text.strip() except AttributeError: bestseller = '' try: wholeprice = soup.find('div', attrs={'class': 'a-spacing-top-mini'}).find("span", attrs= {"class": "a-offscreen"}).text.strip() except AttributeError: wholeprice = '' try: choice = soup.find('div', attrs={'class': 'ac-badge-wrapper'}).find("span", attrs= {"class": "ac-badge-text-primary ac-white"}).text.strip() except AttributeError: choice = '' try: fulfilled = soup.find('div', attrs={'class': 'offer-display-feature-text a-spacing-none'}).find("span", attrs= {"class": "a-size-small 
offer-display-feature-text-message"}).text.strip() except AttributeError: fulfilled = '' goal = {'asin': asin,'fiyat': current_price,'rating': rating,'review': review_count,'stok': available_stock,'indirim yuzdesi': discount_percent,'bsr': xy,'bestseller':bestseller,'Amazons choice': choice, 'fulfilled by' : fulfilled } print(goal) result.append(goal) return resultdef set_delivery_location(driver, postal_code='10008'): driver.get('https://www.amazon.com') wait = WebDriverWait(driver, 10) try: location_button = wait.until(EC.element_to_be_clickable((By.ID, 'nav-global-location-slot'))) location_button.click() postal_code_input = wait.until(EC.element_to_be_clickable((By.ID, 'GLUXZipUpdateInput'))) postal_code_input.send_keys(postal_code) apply_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//span[@id="GLUXZipUpdate"]/span/input'))) apply_button.click() wait.until(EC.text_to_be_present_in_element((By.ID, 'glow-ingress-line2'), postal_code)) except Exception as e: print(f"Error setting delivery location: {e}")def search_keyword(keyword, postal_code='10008'): count_page = 0 count_asin = 0 chromedriver_path = r'C:\yarrak\chromedriver.exe' # Update this path with the correct path to your chromedriver executable service = Service(chromedriver_path) options = webdriver.ChromeOptions() options.add_argument('--headless') # Run headless Chrome driver = webdriver.Chrome(service=service, options=options) # Set delivery location set_delivery_location(driver, postal_code) while True: count_page += 1 url = f'https://www.amazon.com/s?k={keyword}&page={count_page}' print(f'Getting page: {count_page} | {url}') driver.get(url) soup = BeautifulSoup(driver.page_source, 'lxml') try: results = soup.find('div', attrs={'class': 's-main-slot s-result-list s-search-results sg-row'}).find_all('div', attrs={'data-component-type': 's-search-result'}) except AttributeError: continue for ids in results: count_asin += 1 asin = ids['data-asin'] url_product = 
f'https://www.amazon.com/dp/{asin}' print(f'{count_asin}. {url_product}') list_result = get_detail(url_product) df = pd.DataFrame(list_result) keyword = str(keyword).replace('', '+') df.to_excel(f'result_amzn_USA_{keyword}.xlsx', index=False) last_page = soup.find('li', {'class': 'a-disabled a-last'}) if last_page: break driver.quit()Update:
Changed the code to this
import requestsfrom bs4 import BeautifulSoupimport reimport jsonimport pandas as pdfrom faker import Fakerfrom tkinter import *import timeimport randomresult = []# Proxy listesiproxies = ["50.168.210.232","50.207.199.84","50.218.57.65",]def get_soup_retry(url): fake = Faker() uag_random = fake.user_agent() header = {'User-Agent': uag_random,'Accept-Language': 'en-US,en;q=0.9' } isCaptcha = True while isCaptcha: proxy = random.choice(proxies) proxy_dict = {"http": proxy,"https": proxy, } try: page = requests.get(url, headers=header, proxies=proxy_dict, timeout=5) page.raise_for_status() soup = BeautifulSoup(page.content, 'lxml') if 'captcha' in str(soup): uag_random = fake.user_agent() print(f'\rBot has been detected... retrying ... use new identity: {uag_random} and proxy: {proxy}', end='', flush=True) for i in [0.5, 0.1, 1, 2, 3, 4, 5]: print(f"Waiting for {i} seconds") time.sleep(i) continue else: print('Bot bypassed') return soup except (requests.exceptions.RequestException, requests.exceptions.ProxyError) as e: print(f"Request failed: {e}") continue # Başka proxy denessssdef get_detail(url): soup = get_soup_retry(url)...With the proxies i got from there but its not working.
Request failed: HTTPSConnectionPool(host='www.amazon.com', port=443): Max retries exceeded with url: /s?k=leashes+for+dogs&i=pets&page=1 (Caused by ProxyError('Unable to connect to proxy', ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x00000224CF5ECDC0>, 'Connection to 50.207.199.84 timed out. (connect timeout=5)')))
LAST UPDATE
This seems to be working right now. Thanks.
import requests
from bs4 import BeautifulSoup
import re
import json
import pandas as pd
from faker import Faker
from tkinter import *
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Accumulates one dict per scraped product, across every call to get_detail().
result = []

# Cookies exported from the Selenium session after the ZIP code is set.
# BUG FIX: without forwarding these, the requests calls below run in a brand
# new anonymous session and Amazon ignores the delivery location entirely.
session_cookies = {}


def get_soup_retry(url):
    """GET *url* (re-using the Selenium session's cookies so the delivery
    location applies) and return its BeautifulSoup tree, retrying with a
    fresh fake User-Agent while Amazon serves a captcha page."""
    fake = Faker()
    uag_random = fake.user_agent()
    header = {'User-Agent': uag_random, 'Accept-Language': 'en-US,en;q=0.9'}
    while True:
        page = requests.get(url, headers=header, cookies=session_cookies, timeout=5)
        soup = BeautifulSoup(page.content, 'lxml')
        if 'captcha' not in str(soup):
            print('Bot bypassed')
            return soup
        uag_random = fake.user_agent()
        # BUG FIX: the new User-Agent was never written back into the header
        # dict, so every retry re-sent the same blocked identity.
        header['User-Agent'] = uag_random
        print(f'\rBot has been detected... retrying ... use new identity: {uag_random} ', end='', flush=True)
        for i in [0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5]:
            print("Waiting for %s seconds" % i)
            time.sleep(i)


def get_detail(url):
    """Scrape one product page, append its fields to the global *result*,
    and return the accumulated list."""
    soup = get_soup_retry(url)
    try:
        title = soup.find('span', attrs={'id': 'productTitle'}).text.strip()
    except AttributeError:
        title = ''
    try:
        discount_percent = soup.find('div', attrs={'class': 'a-section a-spacing-none aok-align-center aok-relative'}).find('span', attrs={'class': 'a-size-large a-color-price savingPriceOverride aok-align-center reinventPriceSavingsPercentageMargin savingsPercentage'}).text.strip()
    except AttributeError:
        discount_percent = ''
    try:
        current_price = soup.find('div', attrs={'id': 'tp_price_row_ww'}).find('span', attrs={'id': 'tp_price_block_total_price_ww'}).find("span", attrs={"class": "a-offscreen"}).text.strip()
    except AttributeError:
        current_price = ''
    try:
        review_count = soup.find('span', attrs={'id': 'acrCustomerReviewText'}).text.strip()
    except AttributeError:
        review_count = ''
    try:
        available_stock = soup.find('div', attrs={'id': 'availability'}).find('span').text.strip()
    except AttributeError:
        available_stock = ''
    try:
        asin = soup.find(id='averageCustomerReviews').get('data-asin')
    except AttributeError:
        # Fall back to parsing the ASIN out of the /dp/ URL.
        asin = url.split('/dp/')[1].replace('/', '')
    try:
        rating = soup.find('span', attrs={'data-hook': 'rating-out-of-text'}).text.strip()
    except AttributeError:
        rating = ''
    try:
        xy = soup.select_one('th:-soup-contains("Best Sellers Rank") + td').text.split()[0]
    except AttributeError:
        xy = ''
    try:
        bestseller = soup.find('div', attrs={'class': 'zg-badge-wrapper'}).find("a", attrs={"class": "badge-link"}).find("span", attrs={"class": "cat-name"}).find("span", attrs={"class": "cat-link"}).text.strip()
    except AttributeError:
        bestseller = ''
    try:
        wholeprice = soup.find('div', attrs={'class': 'a-spacing-top-mini'}).find("span", attrs={"class": "a-offscreen"}).text.strip()
    except AttributeError:
        wholeprice = ''
    try:
        choice = soup.find('div', attrs={'class': 'ac-badge-wrapper'}).find("span", attrs={"class": "ac-badge-text-primary ac-white"}).text.strip()
    except AttributeError:
        choice = ''
    try:
        fulfilled = soup.find('div', attrs={'class': 'offer-display-feature-text a-spacing-none'}).find("span", attrs={"class": "a-size-small offer-display-feature-text-message"}).text.strip()
    except AttributeError:
        fulfilled = ''
    goal = {
        'asin': asin,
        'title': title,
        'price': current_price,
        'rating': rating,
        'reviews': review_count,
        'stock': available_stock,
        'discount_percentage': discount_percent,
        'best_sellers_rank': xy,
        'bestseller': bestseller,
        'whole_price': wholeprice,
        'amazon_choice': choice,
        'fulfilled_by': fulfilled,
    }
    print(goal)
    result.append(goal)
    return result


def set_delivery_location(postal_code):
    """Set the delivery ZIP in a headless Chrome session, then export that
    session's cookies into *session_cookies* so subsequent requests calls
    inherit the location.

    BUG FIX: the original quit the browser without exporting anything, so
    the ZIP code had no effect on the scraping that followed.
    """
    global session_cookies
    chromedriver_path = r'C:\chromedriver\chromedriver.exe'  # Update to your chromedriver path
    service = Service(chromedriver_path)
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Run headless Chrome
    driver = webdriver.Chrome(service=service, options=options)
    driver.get('https://www.amazon.com')
    wait = WebDriverWait(driver, 10)
    try:
        location_button = wait.until(EC.element_to_be_clickable((By.ID, 'nav-global-location-slot')))
        location_button.click()
        postal_code_input = wait.until(EC.element_to_be_clickable((By.ID, 'GLUXZipUpdateInput')))
        postal_code_input.send_keys(postal_code)
        apply_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//span[@id="GLUXZipUpdate"]/span/input')))
        apply_button.click()
        # Confirm the header ("Deliver to ...") now shows the new ZIP.
        wait.until(EC.text_to_be_present_in_element((By.ID, 'glow-ingress-line2'), postal_code))
        # Carry the located session over to requests.
        session_cookies = {c['name']: c['value'] for c in driver.get_cookies()}
    except Exception as e:
        print(f"Error setting delivery location: {e}")
    finally:
        driver.quit()


def search_keyword(keyword, postal_code='10008'):
    """Set the delivery location once, then walk the search-result pages for
    *keyword*, scraping each ASIN and checkpointing rows to an .xlsx."""
    set_delivery_location(postal_code)
    count_page = 0
    count_asin = 0
    # BUG FIX: replace('', '+') inserts '+' between EVERY character of the
    # keyword; replace spaces instead, and compute the file name once.
    safe_keyword = str(keyword).replace(' ', '+')
    while True:
        count_page += 1
        url = f'https://www.amazon.com/s?k={keyword}&page={count_page}'
        print(f'Getting page: {count_page} | {url}')
        soup = get_soup_retry(url)
        try:
            # Renamed from `result` so the global accumulator is not shadowed.
            cards = soup.find('div', attrs={'class': 's-main-slot s-result-list s-search-results sg-row'}).find_all('div', attrs={'data-component-type': 's-search-result'})
        except AttributeError:
            continue
        for ids in cards:
            count_asin += 1
            asin = ids['data-asin']
            url_product = f'https://www.amazon.com/dp/{asin}'
            print(f'{count_asin}. {url_product}')
            list_result = get_detail(url_product)
            df = pd.DataFrame(list_result)
            df.to_excel(f'result_amzn_USA_{safe_keyword}.xlsx', index=False)
        # Amazon marks a disabled "Next" arrow with this class on the last page.
        last_page = soup.find('li', {'class': 'a-disabled a-last'})
        if last_page:
            break