# Selenium 模拟登录拉勾网
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os, json, time
from urllib import parse
from lxml import etree
from fake_useragent import UserAgent
from pwd import username,password
from pymongo import MongoClient

# User-Agent generator from fake_useragent. It is created at import time but
# is not referenced anywhere else in the visible part of this file.
ua = UserAgent()
# Search keywords to crawl.
keywords_ls = ['python', 'java', 'web', 'c']
# Popular cities to crawl for every keyword.
citys_ls = ['北京', '上海', '深圳', '广州', '杭州', '成都', '南京', '武汉', '西安', '厦门', '长沙', '苏州', '天津']


class LaGouselenium():
    """Crawl Lagou job listings with a Selenium-driven Chrome into MongoDB.

    For every (keyword, city) pair the listing pages are walked and one
    document per job card is stored in the ``selenium.LGW`` collection.
    Finished pairs are written to a JSON checkpoint file so that an
    interrupted run can skip them next time.
    """

    # JSON file holding the (keyword, city) pairs that are already crawled.
    CHECKPOINT_PATH = './lagou_crawled_city.json'
    # Host that appears in the current URL when the site asks for a captcha.
    CAPTCHA_HOST = 'sec.lagou.com'

    def __init__(self, keywords_ls=keywords_ls, citys_ls=citys_ls,
                 driver_path=r'D:\data\chromedriver\chromedriver.exe'):
        """Load the checkpoint, connect to MongoDB and start Chrome.

        Args:
            keywords_ls: search keywords to crawl.
            citys_ls: city names; each must match a city link on the page.
            driver_path: path to the chromedriver executable. The default is
                the path that used to be hard-coded here.
        """
        self.keywords_ls = keywords_ls
        self.citys_ls = citys_ls
        # Pairs already crawled. They are loaded into the same attribute
        # that crawl() checks; previously they went into ``crawled_city``
        # and were therefore never consulted.
        self.crawl_city = self._load_checkpoint(self.CHECKPOINT_PATH)
        # Alias kept for code that still reads the old attribute name.
        self.crawled_city = self.crawl_city
        self.col = MongoClient()['selenium']['LGW']
        # Number of documents stored so far.
        print('已抓取', self.col.count_documents({}))
        self.chrome = webdriver.Chrome(driver_path)
        # Implicit wait of three seconds for element lookups.
        self.chrome.implicitly_wait(3)

    @staticmethod
    def _load_checkpoint(path):
        """Return the checkpointed (keyword, city) pairs as a list of tuples.

        JSON stores each pair as a list, and a list never compares equal to
        the tuple used in the membership test, so every pair is converted
        back to a tuple. A missing file yields an empty list.
        """
        if not os.path.exists(path):
            return []
        with open(path, 'r', encoding='utf8') as f:
            stored = json.load(f)
        return [tuple(pair) for pair in stored]

    def _save_checkpoint(self):
        """Rewrite the checkpoint file with all pairs crawled so far."""
        with open(self.CHECKPOINT_PATH, 'w', encoding='utf8') as f:
            json.dump(self.crawl_city, f, ensure_ascii=False)

    def _wait_if_captcha(self):
        """Block on user input while the browser sits on the captcha host."""
        if self.CAPTCHA_HOST in self.chrome.current_url:
            input('欢迎进入验证码页面!手动处理完成后回车')

    def _dismiss_popup(self):
        """Click the popup button (class ``body-btn``) if it is present."""
        # find_elements returns an empty list instead of raising, so a page
        # without the popup no longer aborts the crawl.
        buttons = self.chrome.find_elements('class name', 'body-btn')
        if buttons:
            buttons[0].click()

    @staticmethod
    def _job_id_from_url(url):
        """Return the last path segment before ``.html`` in a job URL."""
        return url.split('.html')[0].split('/')[-1].strip()

    @staticmethod
    def _parse_card_text(text):
        """Split the visible text of a job card into stored fields.

        The first line is used as the title, the second as the address, and
        the first space-separated token of the fourth line is split on ``-``
        into the salary range. Missing lines give ``''`` (or ``[]`` for the
        salary) instead of raising ``IndexError``.
        """
        lines = text.split('\n')
        title = lines[0] if len(lines) > 0 else ''
        addr = lines[1] if len(lines) > 1 else ''
        salary = lines[3].split(' ')[0].split('-') if len(lines) > 3 else []
        return {'title': title, 'addr': addr, 'salaryRange': salary}

    def login(self):
        """Open the login page, submit the credentials and wait for the user.

        ``username`` and ``password`` come from the module-level import.
        Afterwards the method blocks on ``input`` so that a captcha can be
        solved by hand.
        """
        loginUrl = 'https://passport.lagou.com/login/login.html'
        self.chrome.get(loginUrl)
        # Account, password, submit.
        self.chrome.find_element(
            'xpath', '//form[@class="active"]/div[@data-propertyname="username"]/input'
        ).send_keys(username)
        self.chrome.find_element(
            'xpath', '//form[@class="active"]/div[@data-propertyname="password"]/input'
        ).send_keys(password)
        self.chrome.find_element(
            'xpath', '//form[@class="active"]/div[@data-propertyname="submit"]/input'
        ).click()
        # Block until the user has handled a possible captcha.
        input('如果出现验证码 手动验证后 回车, 否则直接回车')

    def crawl(self):
        """Crawl every keyword/city pair, then fill in job descriptions."""
        popup_pending = True
        for k in self.keywords_ls:
            print(k)
            # Listing sorted by newest; the keyword is percent-encoded so
            # that values such as "c++" or "c#" still form a valid URL.
            url = 'https://www.lagou.com/jobs/list_{}/p-city_0?px=new&#filterBox'.format(
                parse.quote(k, safe=''))
            self.chrome.get(url)
            self._wait_if_captcha()
            # The popup is only dismissed once per run.
            if popup_pending:
                self._dismiss_popup()
                popup_pending = False
            for city in self.citys_ls:
                print(city)
                # Resume support: skip pairs finished in an earlier run.
                if (k, city) in self.crawl_city:
                    continue
                self.chrome.find_element('link text', city).click()
                self._wait_if_captcha()
                self._crawl_city_pages(k, city)
                # Refresh is required, otherwise the next city link cannot
                # be located on the following iteration.
                self.chrome.refresh()
                self.crawl_city.append((k, city))
                print('已爬+++++++++++++', city, k)
                self._save_checkpoint()
        self.checkDetailPage()

    def _crawl_city_pages(self, k, city):
        """Parse every listing page of the currently selected city."""
        while True:
            totalpage = int(self.chrome.find_element('class name', 'totalNum').text.strip())
            curpage = int(self.chrome.find_element('class name', 'curNum').text.strip())
            print('翻页', curpage, '/', totalpage)
            self.parseListPage(k, city)
            # ">=" instead of "==" so a total of 0 cannot loop forever.
            if curpage >= totalpage:
                break
            self.chrome.find_element('class name', 'next').click()
            self._wait_if_captcha()

    def handleTime(self, timestr):
        """Placeholder for publish-time parsing; not implemented, returns None."""
        pass

    def parseListPage(self, keyword, city):
        """Store every job card on the current listing page.

        Waits up to three seconds for the cards to be present. Stops at the
        first job whose id is already in the collection, and returns early
        when a card's link cannot be read.
        """
        # Imported here because the module-level imports do not include it.
        from selenium.common.exceptions import WebDriverException

        WebDriverWait(self.chrome, 3).until(
            EC.presence_of_all_elements_located(('class name', 'con_list_item')))
        time.sleep(1)
        for ele in self.chrome.find_elements('class name', 'con_list_item'):
            try:
                url = ele.find_element('xpath', './div/div/div/a').get_attribute('href')
                jd_id = self._job_id_from_url(url)
            except (WebDriverException, AttributeError):
                # The link is missing or has no href (get_attribute gave
                # None): give up on this page, as the original code did.
                return
            print(jd_id)
            if self.col.find_one({'jd_id': jd_id}):
                print('重复', jd_id)
                # "break" suits incremental runs on a newest-first listing;
                # for a first full crawl "continue" would be used instead.
                break
            item = {"keyword": keyword, 'city': city, 'jd_id': jd_id}
            item.update(self._parse_card_text(ele.find_element('xpath', './div').text))
            print(item, '***********')
            self.col.insert_one(item)

    def checkDetailPage(self):
        """Visit each stored job that lacks a description and save its text."""
        # Inserted documents never carry the field, so the former filter
        # {'job_description': False} matched nothing. Match a missing field
        # as well as an explicit False.
        pending_filter = {'$or': [
            {'job_description': {'$exists': False}},
            {'job_description': False},
        ]}
        # Collect the ids first so no cursor is held open while browsing.
        pending_ids = [doc['jd_id'] for doc in self.col.find(pending_filter, {'jd_id': 1})]
        for jd_id in pending_ids:
            url = 'https://www.lagou.com/jobs/%s.html' % jd_id.strip()
            self.chrome.get(url)
            self._wait_if_captcha()
            des = self.chrome.find_element('class name', 'job-detail').text.strip()
            self.col.update_one({'jd_id': jd_id}, {'$set': {'job_description': des}})


if __name__ == "__main__":
    lagou = LaGouselenium(keywords_ls, citys_ls)
    lagou.login()
    lagou.crawl()
效果演示