My objective is that for website https://data.eastmoney.com/executive/000001.html, when you scroll down you will find a big table
and I want to turn it into a DataFrame in Python. Is BeautifulSoup enough to do so or do I have to use Selenium?
Some people on Stack Overflow said that BeautifulSoup cannot crawl table data from the Internet, so I tried Selenium and here is the code:
driver = webdriver.Chrome()driver.get('https://data.eastmoney.com/executive/000001.html')table_element = driver.find_element_by_xpath("//table")item_element = table_element.find_element_by_xpath("//tr[2]/td[3]")item_text = item_element.textdf = pd.DataFrame([item_text], columns=["Item"])print(df)driver.quit()and here is the outcome:
Traceback (most recent call last): File "selenium/webdriver/common/service.py", line 76, in start stdin=PIPE) File "subprocess.py", line 709, in __init__ restore_signals, start_new_session) File "subprocess.py", line 1344, in _execute_child raise child_exception_type(errno_num, err_msg, err_filename)FileNotFoundError: [Errno 2] No such file or directory: 'chromedriver': 'chromedriver'During handling of the above exception, another exception occurred:Traceback (most recent call last): File "/tmp/jqcore/jqboson/jqboson/core/entry.py", line 379, in _run engine.start() File "/tmp/jqcore/jqboson/jqboson/core/engine.py", line 231, in start self._dispatcher.start() File "/tmp/jqcore/jqboson/jqboson/core/dispatcher.py", line 280, in start self._run_loop() File "/tmp/jqcore/jqboson/jqboson/core/dispatcher.py", line 240, in _run_loop self._loop.run() File "/tmp/jqcore/jqboson/jqboson/core/loop/loop.py", line 107, in run self._handle_queue() File "/tmp/jqcore/jqboson/jqboson/core/loop/loop.py", line 153, in _handle_queue message.callback(**message.callback_data) File "/tmp/jqcore/jqboson/jqboson/core/mds/market_data_subscriber.py", line 228, in broadcast consumer.send(market_data) File "/tmp/jqcore/jqboson/jqboson/core/mds/market_data_consumer_manager.py", line 59, in consumer_gen msg_callback() File "/tmp/jqcore/jqboson/jqboson/core/mds/market_data_consumer_manager.py", line 52, in msg_callback callback(market_data) File "/tmp/jqcore/jqboson/jqboson/core/mds/market_data_consumer_manager.py", line 122, in wrapper result = callback(*args, **kwargs) File "/tmp/jqcore/jqboson/jqboson/core/strategy.py", line 474, in _wrapper self._context.current_dt File "/tmp/strategy/user_code.py", line 85, in handle_data driver = webdriver.Chrome() File "selenium/webdriver/chrome/webdriver.py", line 73, in __init__ self.service.start() File "selenium/webdriver/common/service.py", line 83, in start os.path.basename(self.path), 
self.start_error_message)selenium.common.exceptions.WebDriverException: Message: 'chromedriver' executable needs to be in PATH. Please see https://sites.google.com/a/chromium.org/chromedriver/homeBasically it says "Chromedriver.exe needs to be in PATH". The problem is I am using an online backtest platform called JoinQuant (www.joinquant.com) and all the Python files such as File "selenium/webdriver/common/service.py" are not local - they are not on my computer's disk drive. So that's something complicated with Selenium - Do I have to use Selenium to crawl data like this from the Internet and turn it into a DataFrame in Python? Or can I use something else like BeautifulSoup? For BeautifulSoup, at least it does not have the "driver needs to be in PATH" problem.
For BeautifulSoup, here's what I tried:
# Web Crawler# Sent HTTP Request to get Internet contenturl = 'https://data.eastmoney.com/executive/000001.html'response = requests.get(url)html_content = response.text# Check if the request is successfulif response.status_code == 200: # Use BeautifulSoup to Analyze Internet information and get the table soup = BeautifulSoup(html_content, 'html.parser') table = soup.find_all('table') # Acquire the rows and columns of the table rows = table.find_all('tr') data = [] for row in rows: cols = row.find_all('td') row_data = [] for col in cols: row_data.append(col.text.strip()) data.append(row_data)else: print("Failed to Retrieve the Webpage.")# Set up DataFramedataframe = pd.DataFrame(data)# Print DataFrameprint(dataframe)and here's the output:
Traceback (most recent call last): File "/tmp/jqcore/jqboson/jqboson/core/entry.py", line 379, in _run engine.start() File "/tmp/jqcore/jqboson/jqboson/core/engine.py", line 231, in start self._dispatcher.start() File "/tmp/jqcore/jqboson/jqboson/core/dispatcher.py", line 280, in start self._run_loop() File "/tmp/jqcore/jqboson/jqboson/core/dispatcher.py", line 240, in _run_loop self._loop.run() File "/tmp/jqcore/jqboson/jqboson/core/loop/loop.py", line 107, in run self._handle_queue() File "/tmp/jqcore/jqboson/jqboson/core/loop/loop.py", line 153, in _handle_queue message.callback(**message.callback_data) File "/tmp/jqcore/jqboson/jqboson/core/mds/market_data_subscriber.py", line 228, in broadcast consumer.send(market_data) File "/tmp/jqcore/jqboson/jqboson/core/mds/market_data_consumer_manager.py", line 59, in consumer_gen msg_callback() File "/tmp/jqcore/jqboson/jqboson/core/mds/market_data_consumer_manager.py", line 52, in msg_callback callback(market_data) File "/tmp/jqcore/jqboson/jqboson/core/mds/market_data_consumer_manager.py", line 122, in wrapper result = callback(*args, **kwargs) File "/tmp/jqcore/jqboson/jqboson/core/strategy.py", line 474, in _wrapper self._context.current_dt File "/tmp/strategy/user_code.py", line 114, in handle_data rows = table.find_all('tr') File "bs4/element.py", line 1884, in __getattr__"ResultSet object has no attribute '%s'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?" % keyAttributeError: ResultSet object has no attribute 'find_all'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?But if you change
table = soup.find_all('table')into
table = soup.find('table')Here's the outcome:
Traceback (most recent call last): File "/tmp/jqcore/jqboson/jqboson/core/entry.py", line 379, in _run engine.start() File "/tmp/jqcore/jqboson/jqboson/core/engine.py", line 231, in start self._dispatcher.start() File "/tmp/jqcore/jqboson/jqboson/core/dispatcher.py", line 280, in start self._run_loop() File "/tmp/jqcore/jqboson/jqboson/core/dispatcher.py", line 240, in _run_loop self._loop.run() File "/tmp/jqcore/jqboson/jqboson/core/loop/loop.py", line 107, in run self._handle_queue() File "/tmp/jqcore/jqboson/jqboson/core/loop/loop.py", line 153, in _handle_queue message.callback(**message.callback_data) File "/tmp/jqcore/jqboson/jqboson/core/mds/market_data_subscriber.py", line 228, in broadcast consumer.send(market_data) File "/tmp/jqcore/jqboson/jqboson/core/mds/market_data_consumer_manager.py", line 59, in consumer_gen msg_callback() File "/tmp/jqcore/jqboson/jqboson/core/mds/market_data_consumer_manager.py", line 52, in msg_callback callback(market_data) File "/tmp/jqcore/jqboson/jqboson/core/mds/market_data_consumer_manager.py", line 122, in wrapper result = callback(*args, **kwargs) File "/tmp/jqcore/jqboson/jqboson/core/strategy.py", line 474, in _wrapper self._context.current_dt File "/tmp/strategy/user_code.py", line 114, in handle_data rows = table.find_all('tr')AttributeError: 'NoneType' object has no attribute 'find_all'So to sum all up, which one should I use? Selenium or BeautifulSoup? Or even something else? And how should I tackle this issue?