基于Python猫眼票房TOP100电影数据抓取
- 使用requests库抓取数据
- 使用BeautifulSoup解析网页内容,将数据存入csv文件
- 处理数据,生成图表
本次数据爬取使用Python语言在Jupyter Notebook上完成,并使用爬取到的数据生成了一些图表。
使用requests库抓取数据
本段代码提取(www.maoyan.com/board/4) 网页中所有信息.
import requests as req
import re
from bs4 import BeautifulSoup as bs
import time as ti


def link(url, timeout=10):
    """Fetch *url* and return the parsed page, or None on a non-200 response.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller indefinitely.

    Returns:
        A BeautifulSoup document built with the "lxml" parser when the server
        answers with status 200, otherwise None.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    # NOTE: the cookie below is tied to one browser session and has to be
    # replaced with a current value copied from the browser.
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36 Edg/94.0.992.50",
        "cookie": "__mta=49569177.1634546762447.1635751930589.1635755537866.189; __mta=49569177.1634546762447.1635755537866.1636089904755.190; _lxsdk_cuid=17c92944154c8-0976821c9ff2ba-513c1f42-1fa400-17c9294415486; _lx_utm=utm_source=Baidu&utm_medium=organic; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1634709537,1634709791,1634709795,1635568974; __mta=246700987.1634548728665.1634549353008.1635568973951.3; uuid_n_v=v1; uuid=2B2B05903C7611EC9D395F5B080ABF0B13E32B82F40F483EA9968669224339F7; _csrf=04f1258f2b355ec8fc0e1f4b729b4545d23fc1610830265d79846b5d5b3fc7cf; _lxsdk=2B2B05903C7611EC9D395F5B080ABF0B13E32B82F40F483EA9968669224339F7; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1636089905; _lxsdk_s=17cee6e2ca2-e12-d9c-bd9||14",
    }
    res = req.get(url, headers=header, timeout=timeout)
    if res.status_code == 200:
        return bs(res.text, "lxml")
    return None
上面的代码中,cookie需要自己去更改。
打开猫眼界面(www.maoyan.com/board/4),cookie的获取方式如图所示
BeautifulSoup解析网页内容,将数据存入csv文件
# Column buffers: position k in every list describes the k-th scraped movie,
# so each movie must append exactly one value to every list.
Title = []
Director = []
Actors = []
Rating = []
Income = []
Duration = []
Type = []
Region = []
Release = []
main_Type = []
Score = []

# The board shows 10 movies per page; offsets 0, 10, ..., 90 cover the TOP100.
for offset in range(0, 100, 10):
    url = "https://maoyan.com/board/4?offset=" + str(offset)
    board = link(url)
    if board is None:
        # link() returns None for a non-200 response; skip the page instead
        # of failing on None.find_all.
        print("Skipped board page: " + url)
        continue

    for item in board.find_all("dd"):
        # --- fields available on the board page itself ---
        num = item.find("i").text
        name = item.find("a").get("title")
        # Accept both an ASCII and a full-width colon after the label.
        actor = re.findall(
            "主演[::](.*)", item.find("p", class_="star").text
        )[0]
        # Keep only the year: drop any "(...)" suffix, then the "-MM-DD" part.
        release = re.findall(
            "上映时间[::](.*)", item.find("p", class_="releasetime").text
        )[0].split('(')[0].split('-')[0]
        score = (
            item.find("i", class_="integer").text
            + item.find("i", class_="fraction").text
        )
        url1 = "https://maoyan.com" + item.find("p", class_="name").a.get("href")

        # --- fields that need the movie's detail page ---
        movie = link(url1)
        ti.sleep(2)  # pause between detail-page requests
        if movie is None:
            # Nothing has been appended for this movie yet, so skipping it
            # keeps all column lists the same length.
            print("Skipped movie page: " + url1)
            continue

        director = movie.find("a", class_="name").text.replace("\n", "").replace(" ", "")
        type2 = movie.find("a", class_="text-link").text.strip()
        info_items = movie.find_all("li", class_="ellipsis")
        type1 = info_items[0].text.replace("\n", " ")
        # The second info line is read as "<region> / <minutes>分钟".
        region = info_items[1].text.split('/')[0].strip()
        minutes = re.findall(r'\d+', info_items[1].text)
        duration = minutes[0] + "分钟" if minutes else "暂无"

        # Income: use the second-to-last "mbox-name" box (with a single box
        # the index wraps around to that box), or "暂无" when there is none.
        income_boxes = movie.find_all("div", class_="mbox-name")
        if income_boxes:
            income_temp = income_boxes[len(income_boxes) - 2].text
            income = income_temp if income_temp == "暂无" else income_temp + "万"
        else:
            income = "暂无"

        # All values are known; append them together so no list can run ahead.
        Region.append(region)
        Title.append(name)
        Director.append(director)
        Actors.append(actor)
        Rating.append(num)
        Release.append(release)
        main_Type.append(type2)
        Score.append(score)
        Duration.append(duration)
        Type.append(type1)
        Income.append(income)

print("Grab complete")
将数据存入csv文件
import pandas as pd
# Assemble the scraped column lists into one table, index it by ranking,
# and export it to ./test.csv.
df = pd.DataFrame(
    {
        'Rating': Rating,
        'Title': Title,
        'Director': Director,
        'Actors': Actors,
        'Income': Income,
        'Score': Score,
        'Duration': Duration,
        'Release': Release,
        'Type': Type,
        'Region': Region,
        'Main Type': main_Type,
    }
).set_index('Rating')

# utf_8_sig prepends a byte-order mark to the file
# (presumably so spreadsheet tools detect the encoding; confirm if it matters).
df.to_csv('./test.csv', encoding='utf_8_sig')
print('ok')
csv文件
处理数据,生成图表
读取csv文件
import csv
from datetime import datetime
import matplotlib.pyplot as plt
import pandas as pd
# seaborn.apionly is a deprecated entry point that later seaborn releases no
# longer ship; fall back to the regular package so the import does not fail.
try:
    import seaborn.apionly as sns
except ImportError:
    import seaborn as sns
# Star import kept as in the original: other cells may rely on pylab names.
from pylab import *

# Load the table written by the scraping step.
filename = 'test.csv'
data = pd.read_csv(filename)
各个区域电影占比的饼状图
from collections import Counter

# A Region cell may list several regions separated by commas, so join all
# cells and split again to get one entry per region mention. Missing cells
# (NaN) are dropped first because str.join only accepts strings.
readtxt = ",".join(data.Region.dropna().astype(str).values)
readlist = [name for name in readtxt.split(',') if name]

# Number of mentions per region, in first-seen order.
dict_region = dict(Counter(readlist))

# Regions with fewer than 4 mentions are merged into a single "其他" slice.
region = []
num_region = []
region_key = "其他"
region_value = 0
for key, count in dict_region.items():
    if count < 4:
        region_value += count
    else:
        region.append(key)
        num_region.append(count)
# Only add the merged slice when something was merged; this also avoids a
# zero-sized wedge when every region has at least 4 mentions.
if region_value > 0:
    region.append(region_key)
    num_region.append(region_value)

# SimHei is selected as the sans-serif font (the labels are Chinese text);
# unicode_minus=False makes matplotlib draw minus signs with an ASCII hyphen.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
plt.figure(figsize=(20, 8), dpi=100)
color = ['#b1d1fc', '#75bbfd', '#2976bb', '#acc2d9', '#a2cffe', '#c5c9c7', '#276ab3', '#b7c9e2', '#49759c']
plt.title('Region')

# plt.pie needs one explode value per wedge, so size the list from the data.
# As in the original 11-entry list, the second wedge is offset slightly more.
explode = [0.04] * len(num_region)
if len(explode) > 1:
    explode[1] = 0.05

plt.pie(
    x=num_region,
    labels=region,
    colors=color,
    autopct='%0.1f%%',
    shadow=False,
    pctdistance=0.8,
    explode=explode,
    startangle=286,
)
plt.legend(loc="upper right", bbox_to_anchor=(0.7, 0, 0.5, 1))
plt.savefig('./region.png')
plt.show()
饼状图展示
代码资源已经上传
https://download.csdn.net/download/weixin_43165086/37368600


















