python-爬取网站数据

article/2025/10/22 13:32:46

一、安装jar

找到python解释解释器的安装根目录，执行以下命令，

pip install requests
#如果这个BeautifulSoup这个安装不上，可以换成pip install BeautifulSoup4试试
pip install BeautifulSoup

二、肯定就是敲代码了

import urllib.request
from bs4 import BeautifulSoup
def handle_request(url):#反爬headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0',}#请求request=urllib.request.Request(url,headers=headers)return request
def parse_content(content,fp):# 生成soup对象 lxml类型 soup已经拿到网页所有数据soup=BeautifulSoup(content,'lxml')#分析网页，获取自己想要的数据 通过select 来获取指定的数据name_list=soup.select("h3")content_list=soup.select("div .des")datelist=[]#把数据进行循环并且格式化数据for x,y in zip(name_list,content_list):#去空格name=x.get_text().strip('\n /')content=y.get_text().strip('\n /')#格式数据dict={"书名":name,"内容":content}datelist.append(dict)#print(name+":"+conten)if datelist=="":print("没有打印内容")return#写入到磁盘，把数据进行持久化fp.write(str(datelist))#关闭文件流fp.close()def main():# 打开文件fp = open('作者合集.txt','w',encoding='utf8')url = 'https://www.shicimingju.com/hecheng/index.html'# 构建请求对象request = handle_request(url)# 发送请求，得到响应content=urllib.request.urlopen(request).read().decode('utf8')# 解析内容即可parse_content(content,fp)if __name__ == '__main__':main()