Quantcast
Channel: Active questions tagged python - Stack Overflow
Viewing all articles
Browse latest Browse all 13951

Scraping Power BI Dashboard with Selenium

$
0
0

I'm having trouble scraping a Power BI dashboard using Selenium. It seems that I load the URL correctly and that the code structure is sound, but the code fails to parse every column after the first one (Job Name).

I don't need to click on anything, just scroll down the page to extract all data.

The length of the state data is only 150, while the length of the job name data is 362. The following column, "Licensed, Registered or Certified by the State", then returned a len(licensed_data) of only 62. I stopped adding the scroll code to the remaining columns because I ran into the above errors before getting very far. I keep the div selector the same for every column because the page has the same HTML structure throughout.

If anyone could help me understand what I'm doing wrong, that would be much appreciated. Again, I'm only trying to scrape the table in the above dashboard.

# Scrape the text of each column of a Power BI report table with Selenium.
#
# Every column is located by the aria-label of its <div>; the element's
# visible text is then read into a module-level variable.

import time

from selenium import webdriver
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def _scroll_to_bottom(driver, pause: float = 1.0, max_rounds: int = 50) -> bool:
    """Scroll the window down until its bottom is reached.

    After each scroll the function sleeps for *pause* seconds so that content
    has time to load, then re-checks the scroll position.

    Args:
        driver: The Selenium WebDriver controlling the page.
        pause: Seconds to wait after each scroll before re-checking.
        max_rounds: Upper bound on scroll attempts, so that a page which
            never reports "bottom reached" cannot hang the script.

    Returns:
        True if the bottom of the page was reached, False if *max_rounds*
        was exhausted first.
    """
    for _ in range(max_rounds):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)  # Adjust to the page's load speed.
        # Evaluate the whole condition in a single round trip so that both
        # sides are read from the same page state. The 1px tolerance absorbs
        # fractional scroll offsets.
        at_bottom = driver.execute_script(
            "return window.innerHeight + window.scrollY"
            " >= document.body.scrollHeight - 1;"
        )
        if at_bottom:
            return True
    return False


def _locate_column(wait: WebDriverWait, label: str):
    """Wait for the <div> whose aria-label equals *label* and return it.

    Args:
        wait: The WebDriverWait used to poll for the element.
        label: Exact aria-label value of the column's <div>.

    Returns:
        The matching element once it is visible.

    Raises:
        selenium.common.exceptions.TimeoutException: If the element does not
            become visible within the wait's timeout.
    """
    selector = f'div[aria-label="{label}"]'
    return wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, selector)))


option = webdriver.ChromeOptions()
option.add_argument("--start-maximized")
driver = webdriver.Chrome(options=option)
wait = WebDriverWait(driver, 10)

# NOTE(review): the browser session is left open after the script finishes;
# call driver.quit() once the data is no longer needed.

# Load the page
driver.get("https://app.powerbi.com/view?r=eyJrIjoiNzA0MGM4NGMtN2E5Ny00NDU3LWJiNzMtOWFlMGIyMDczZjg2IiwidCI6IjM4MmZiOGIwLTRkYzMtNDEwNy04MGJkLTM1OTViMjQzMmZhZSIsImMiOjZ9&pageName=ReportSection")

# NOTE(review): the first four columns scroll the *window* before reading.
# The table visual presumably has its own (virtualized) scroll container, in
# which case window scrolling would not load additional rows -- confirm
# against the page DOM.

## job name
job_name_data_element = _locate_column(wait, "Job Name")
_scroll_to_bottom(driver)
# Extract the text from the Job Name data element after scrolling
job_name_data = job_name_data_element.text

## state
state_column_element = _locate_column(wait, "State")
_scroll_to_bottom(driver)
state_data = state_column_element.text

## license
# NOTE(review): the accompanying question calls this column "Licensed,
# Registered or Certified by the State"; verify that the aria-label really
# is just "Licensed".
licensed_column_element = _locate_column(wait, "Licensed")
_scroll_to_bottom(driver)
licensed_data = licensed_column_element.text

## education
education_column_element = _locate_column(wait, "Education Requirement")
_scroll_to_bottom(driver)
education_data = education_column_element.text

# The remaining columns are read without scrolling first.

## training
training_column_element = _locate_column(wait, "Amount of Training Required [In Hours]")
training_data = training_column_element.text

## experience
experience_column_element = _locate_column(wait, "Amount of Experience Required")
experience_data = experience_column_element.text

## pro exam
exam_column_element = _locate_column(wait, "Professional Exam")
exam_data = exam_column_element.text

## renewal time
renewal_time_column_element = _locate_column(wait, "Required Time of License Renewal (In Years)")
renewal_time_data = renewal_time_column_element.text

## continuing education
continious_education_column_element = _locate_column(wait, "Continuing Education Requirement")
continious_education_column_element_data = continious_education_column_element.text

## additional exams
additional_exams_column_element = _locate_column(wait, "Additional Required Exams")
additional_exams_column_element_data = additional_exams_column_element.text

## cost of initial licensure
cost_of_licensure_column_element = _locate_column(wait, "Cost of Initial Licensure (In Dollars)")
cost_of_licensure_column_element_data = cost_of_licensure_column_element.text

## license renewal
license_renewal_column_element = _locate_column(wait, "Cost of License Renewal (In Dollars)")
license_renewal_column_element_data = license_renewal_column_element.text

## reciprocity
reciprocity_column_element = _locate_column(wait, "Reciprocity or Endorsement")
reciprocity_column_element_data = reciprocity_column_element.text

## character
character_column_element = _locate_column(wait, "Good Moral Character Requirement")
character_column_element_data = character_column_element.text

## blanket ban
ban_column_element = _locate_column(wait, "Blanket Ban for Ex-Offenders")
ban_column_element_data = ban_column_element.text

## rehab
rehab_column_element = _locate_column(wait, "Rehabilitation Requirement")
rehab_column_element_data = rehab_column_element.text

## relationship
relationship_column_element = _locate_column(wait, "Relationship between Offense and Occupation")
relationship_column_element_data = relationship_column_element.text

## limitations
limitations_column_element = _locate_column(wait, "Limitations on Scope of Inquiry")
limitations_column_element_data = limitations_column_element.text

## age
age_column_element = _locate_column(wait, "Minimum Age (In Years)")
age_column_element_data = age_column_element.text

Viewing all articles
Browse latest Browse all 13951

Trending Articles



<script src="https://jsc.adskeeper.com/r/s/rssing.com.1596347.js" async> </script>