[python]微信公众号文章爬取
需求
爬取一些微信公众号的文章
数据来源
1.搜狗微信搜索,可以搜索微信公众号文章,但只能显示该公众号最近的十篇文章
2.通过个人微信公众号中的素材管理,查看其他微信公众号文章
步骤
1.手动从网站上获取cookie,通过cookie登录
2.从请求url中获取token
3.拼接参数请求https://mp.weixin.qq.com/cgi-bin/searchbiz
获取公众号的fakeid也就是biz
4.拼接参数请求https://mp.weixin.qq.com/cgi-bin/appmsg?
获取文章列表信息
5.通过文章url爬取文章
通过这种方式没办法得到阅读量和点赞数,因为在网页中打开公众号文章时不会显示阅读数和点赞数
代码
github仓库地址
import requests
import json
import re
import time


class WeChatCrawler:
    """Crawl article lists of WeChat official accounts via mp.weixin.qq.com.

    The crawler authenticates with a cookie string that was copied by hand
    into ``cookie.txt``, reads the session token from the URL the site
    redirects to, resolves each account name to its ``fakeid`` through the
    ``searchbiz`` endpoint and then pages through the ``appmsg`` endpoint,
    printing the link and title of every article.

    Args:
        wxList: iterable of official-account names to crawl.

    Raises:
        RuntimeError: if no token can be found in the redirect URL
            (typically because the cookie is missing or expired).
    """

    # Number of articles requested per page of the appmsg endpoint.
    _PAGE_SIZE = 5
    # Seconds to wait between two page requests.
    _PAGE_DELAY = 3
    # Seconds before an HTTP request is abandoned instead of hanging forever.
    _REQUEST_TIMEOUT = 10

    def __init__(self, wxList):
        self.wxList = wxList
        self.cookies = self.__getCookiesFromText()
        self.token = self.__getToken()
        self.headers = {
            "HOST": "mp.weixin.qq.com",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0"
        }
        # Base query parameters; per-request values are merged into a copy
        # so that state never leaks from one request to the next.
        self.searchBizParam = {
            'action': 'search_biz',
            'token': self.token,
            'lang': 'zh_CN',
            'f': 'json',
            'ajax': '1',
            'query': '',
            'begin': '0',
            'count': '5',
        }
        self.getMsgListParam = {
            'token': self.token,
            'lang': 'zh_CN',
            'f': 'json',
            'ajax': '1',
            'action': 'list_ex',
            'begin': '0',
            'count': str(self._PAGE_SIZE),
            'query': '',
            'fakeid': '',
            'type': '9'
        }

    @staticmethod
    def _parseCookieString(cookieStr):
        """Turn a ``name=value; name2=value2`` cookie header into a dict.

        Only the first ``=`` of each pair separates name from value, so
        values that themselves contain ``=`` are kept intact. Surrounding
        whitespace (including a trailing newline) is ignored, and pairs
        with an empty name or an empty value are dropped.
        """
        cookies = {}
        for pair in cookieStr.split(';'):
            name, sep, value = pair.partition('=')
            name = name.strip()
            value = value.strip()
            if not sep or not name or not value:
                continue
            cookies[name] = value
        return cookies

    @staticmethod
    def _extractToken(url):
        """Return the numeric ``token`` query value found in *url*.

        Raises:
            RuntimeError: if *url* carries no ``token=<digits>`` part.
        """
        match = re.search(r'token=(\d+)', str(url))
        if match is None:
            raise RuntimeError(
                'no token found in {!r}; the cookie is probably missing or expired'.format(str(url))
            )
        return match.group(1)

    @staticmethod
    def _pageCount(total, pageSize):
        """Return how many pages of *pageSize* items are needed for *total* items.

        Rounds up so that a final, partially filled page is still counted.
        """
        return (total + pageSize - 1) // pageSize

    def __getCookiesFromText(self):
        """Load the manually exported cookie string from ``cookie.txt``."""
        with open('cookie.txt', 'r', encoding='utf-8') as f:
            cookieStr = f.read()
        return self._parseCookieString(cookieStr)

    def __getToken(self):
        """Request the site root and read the token from the final URL."""
        url = 'https://mp.weixin.qq.com'
        response = requests.get(url=url, cookies=self.cookies, timeout=self._REQUEST_TIMEOUT)
        return self._extractToken(response.url)

    def __getWXFakeid(self, wx):
        """Look up the ``fakeid`` of the first search hit for account name *wx*.

        Returns:
            The ``fakeid`` value, or ``None`` when the search yields no result.
        """
        searchUrl = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?'
        params = dict(self.searchBizParam, query=wx)
        searchResponse = requests.get(searchUrl, cookies=self.cookies, headers=self.headers,
                                      params=params, timeout=self._REQUEST_TIMEOUT)
        accounts = searchResponse.json().get('list') or []
        if not accounts:
            return None
        return accounts[0].get('fakeid')

    def __getWXMsgCnt(self, fakeId):
        """Return the ``app_msg_cnt`` field reported for *fakeId* (may be ``None``)."""
        appmsgUrl = 'https://mp.weixin.qq.com/cgi-bin/appmsg?'
        # Always ask for the first page so the request does not depend on
        # where a previous crawl stopped.
        params = dict(self.getMsgListParam, fakeid=fakeId, begin='0')
        appmsgResponse = requests.get(appmsgUrl, cookies=self.cookies, headers=self.headers,
                                      params=params, timeout=self._REQUEST_TIMEOUT)
        return appmsgResponse.json().get('app_msg_cnt')

    def __getWXMsgList(self, fakeId):
        """Page through the article list of *fakeId*, printing link and title."""
        appmsgUrl = 'https://mp.weixin.qq.com/cgi-bin/appmsg?'
        wxMsgCnt = self.__getWXMsgCnt(fakeId)
        if wxMsgCnt is None:
            return
        pages = self._pageCount(int(wxMsgCnt), self._PAGE_SIZE)
        for page in range(pages):
            begin = page * self._PAGE_SIZE
            print('====翻页====', begin)
            params = dict(self.getMsgListParam, fakeid=fakeId, begin=str(begin))
            msgListResponse = requests.get(appmsgUrl, cookies=self.cookies, headers=self.headers,
                                           params=params, timeout=self._REQUEST_TIMEOUT)
            msgList = msgListResponse.json().get('app_msg_list')
            if msgList is None:
                # The response carries no article list; further pages would
                # fail the same way, so stop instead of crashing.
                print('no app_msg_list in response, stopping at begin =', begin)
                break
            for item in msgList:
                # TODO: extract more fields
                msgLink = item.get('link')
                print(msgLink)
                msgTitle = item.get('title')
                print(msgTitle)
            # Pause between pages to keep the request rate low.
            time.sleep(self._PAGE_DELAY)

    def runCrawler(self):
        """Resolve every account name to its fakeid, then crawl each account."""
        names = list(self.wxList)
        fakeIds = [self.__getWXFakeid(wx) for wx in names]
        for wx, fakeId in zip(names, fakeIds):
            if fakeId is None:
                print('no official account found for {!r}, skipping'.format(wx))
                continue
            self.__getWXMsgList(fakeId)


if __name__ == '__main__':
    # example
    wxList = ['量子位', ]
    wc = WeChatCrawler(wxList)
    wc.runCrawler()