From 425998a75f6298037530bd7fac14ba21deca83a9 Mon Sep 17 00:00:00 2001 From: Abhay Singhal Date: Wed, 2 Mar 2022 21:12:32 -0800 Subject: [PATCH] fixed scraping target class name, added ability to get headcount directly --- linkedin_scraper/company.py | 14 ++++++++++++-- linkedin_scraper/person.py | 2 +- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/linkedin_scraper/company.py b/linkedin_scraper/company.py index f314125..e8d1e57 100644 --- a/linkedin_scraper/company.py +++ b/linkedin_scraper/company.py @@ -42,8 +42,10 @@ class Company(Scraper): company_type = None company_size = None specialties = None - showcase_pages =[] + showcase_pages = [] affiliated_companies = [] + employees = [] + headcount = None def __init__(self, linkedin_url = None, name = None, about_us =None, website = None, headquarters = None, founded = None, industry = None, company_type = None, company_size = None, specialties = None, showcase_pages =[], affiliated_companies = [], driver = None, scrape = True, get_employees = True, close_on_complete = True): self.linkedin_url = linkedin_url @@ -205,7 +207,7 @@ def scrape_logged_in(self, get_employees = True, close_on_complete = True): section_id = 3 #section ID is no longer needed, we are using class name now. 
#grid = driver.find_elements_by_tag_name("section")[section_id] - grid = driver.find_element_by_class_name("artdeco-card.p4.mb3") + grid = driver.find_element_by_class_name("artdeco-card.p5.mb4") print(grid) descWrapper = grid.find_elements_by_tag_name("p") if len(descWrapper) > 0: @@ -236,6 +238,13 @@ def scrape_logged_in(self, get_employees = True, close_on_complete = True): elif txt == 'Specialties': self.specialties = "\n".join(values[i+x_off].text.strip().split(", ")) + grid = driver.find_element_by_class_name("mt1") + spans = grid.find_elements_by_tag_name("span") + for span in spans: + txt = span.text.strip() + if "See all" in txt and "employees on LinkedIn" in txt: + self.headcount = int(txt.replace("See all", "").replace("employees on LinkedIn", "").replace(",", "").strip()) + driver.execute_script("window.scrollTo(0, Math.ceil(document.body.scrollHeight/2));") @@ -346,6 +355,7 @@ def __repr__(self): _output['founded'] = self.founded _output['affiliated_companies'] = self.affiliated_companies _output['employees'] = self.employees + _output['headcount'] = self.headcount return json.dumps(_output).replace('\n', '') diff --git a/linkedin_scraper/person.py b/linkedin_scraper/person.py index eb1a3bd..60f6237 100644 --- a/linkedin_scraper/person.py +++ b/linkedin_scraper/person.py @@ -5,7 +5,7 @@ from selenium.webdriver.support import expected_conditions as EC from .objects import Experience, Education, Scraper, Interest, Accomplishment, Contact import os -from linkedin_scraper import selectors +from . import selectors class Person(Scraper):