1.启动pyspider
2.新建一个项目
3.代码
4.注意事项:目标网站的网址等信息都已经变了,需要按当前页面重新填写。
5.存储到 MongoDB。
注意:这个地方我错了三次。
6.在 Tableau 中做可视化时,才发现了错误 1、2。
之后就能可视化了。本次实验还是个半成品,后期会补充。
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-10-22 17:07:31
# Project: TripAdvisor

from pyspider.libs.base_handler import *
import pymongo


class Handler(BaseHandler):
    """pyspider handler that crawls TripAdvisor London attraction listings.

    Listing pages are followed through their "next" link, each attraction
    detail page is scraped for a few text fields, and every non-empty
    result is inserted into the ``lodon`` collection of the ``trip``
    MongoDB database on ``localhost``.
    """

    crawl_config = {}

    # Client and database handles are class attributes, so they are created
    # once when the class body is executed and shared by all instances.
    client = pymongo.MongoClient('localhost')
    db = client['trip']

    @every(minutes=24 * 60)
    def on_start(self):
        """Seed the crawl with the first listing page."""
        self.crawl('https://www.tripadvisor.cn/Attractions-g186338-Activities-c47-oa30-London_England.html#FILTERED_LIST', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Queue every attraction linked from a listing page, then the next page.

        Args:
            response: pyspider response for a listing page; ``response.doc``
                is queried with CSS selectors.
        """
        # Match all listing entries by id prefix. The previous selector used
        # the literal id '#ATTR_ENTRY_194299', which can match only that one
        # entry, so the other attractions on the page were never queued.
        entries = response.doc('[id^="ATTR_ENTRY_"] > div.attraction_clarity_cell > div > div > div.listing_info > div.listing_title > a')
        for each in entries.items():
            self.crawl(each.attr.href, callback=self.detail_page)

        # Named next_url rather than `next` to avoid shadowing the builtin.
        next_url = response.doc('#FILTERED_LIST > div.al_border.deckTools.btm > div > div > a.nav.next.rndBtn.ui_button.primary.taLnk ').attr.href
        # When no "next" link matches, .attr.href is None; skip the crawl
        # instead of handing None to self.crawl.
        if next_url:
            self.crawl(next_url, callback=self.index_page)

    @config(priority=2)
    def detail_page(self, response):
        """Scrape one attraction detail page.

        Args:
            response: pyspider response for an attraction detail page.

        Returns:
            dict with the keys ``url``, ``name``, ``phone``, ``durating`` and
            ``score``. ``url`` is the response URL; the other values are the
            text of the elements matched by the selectors below.
        """
        url = response.url
        name = response.doc('#taplc_trip_planner_breadcrumbs_0 > ul > li:nth-child(6)').text()
        phone = response.doc('#taplc_location_detail_contact_card_ar_responsive_0 > div.contactInfo > div.contact > div.contactType.phone.is-hidden-mobile > div').text()
        # "durating" holds the text of the "seeAllReviews" link; the key name
        # is kept as-is because stored documents already use it.
        durating = response.doc('#taplc_location_detail_reviews_card_0 > div.section.rating > a.seeAllReviews').text()
        score = response.doc('#taplc_location_detail_reviews_card_0 > div.section.rating > span').text()

        return {
            "url": url,
            "name": name,
            "phone": phone,
            "durating": durating,
            "score": score,
        }

    def on_result(self, result):
        """Store a result in MongoDB; falsy results are ignored."""
        if result:
            self.save_to_mongo(result)

    def save_to_mongo(self, result):
        """Insert one result document into the ``lodon`` collection.

        Args:
            result: dict produced by ``detail_page``.
        """
        # insert_one replaces Collection.insert, which is deprecated in
        # PyMongo 3 and removed in PyMongo 4. It raises on failure, so
        # reaching the print below means the document was written.
        # NOTE(review): the collection name is spelled 'lodon'; kept unchanged
        # so previously stored data stays in the same collection.
        self.db['lodon'].insert_one(result)
        print('saved to mongo', result)