爬取网站
美桌(http://www.win4000.com)
由于该网站并没有robots协议,因此可以任意爬取。
技术路线
requests+re
除此之外用到了pypinyin库用于进行名字到拼音的转换
代码
# -*- coding: utf-8 -*-
"""
Created on Mon Oct 16 20:32:27 2017
@author: 望
"""
import requests
import re
import os
from pypinyin import pinyin, lazy_pinyin
def getHTMLText(url):try:r = requests.get(url,timeout=30)r.raise_for_status()r.encoding = r.apparent_encodingreturn r.textexcept:print("")def getPageUrls(text,name):re_pageUrl=r'href="(.+)">\s*<img src="(.+)" alt="'+namereturn re.findall(re_pageUrl,text)def downPictures(text,root,name):pageUrls=getPageUrls(text,name) titles=re.findall(r'alt="'+name+r'(.+)" ',text)for i in range(len(pageUrls)):pageUrl=pageUrls[i][0]path = root + titles[i]+ "//"if not os.path.exists(path):os.mkdir(path)if not os.listdir(path): pageText=getHTMLText(pageUrl)totalPics=int(re.findall(r'<em>(.+)</em>)',pageText)[0])downUrl=re.findall(r'href="(.+?)" class="">下载图片',pageText)[0]cnt=1;while(cnt<=totalPics):picPath=path+str(cnt)+".jpg"r=requests.get(downUrl)with open(picPath,'wb') as f:f.write(r.content)f.close()print('{} - 第{}张下载已完成\n'.format(titles[i],cnt))cnt+=1 nextPageUrl=re.findall(r'href="(.+?)">下一张',pageText)[0]pageText=getHTMLText(nextPageUrl)downUrl=re.findall(r'href="(.+?)" class="">下载图片',pageText)[0]returndef main(): name=input("请输入你喜欢的明星的名字:")nameUrl="http://www.win4000.com/mt/"+''.join(lazy_pinyin(name))+".html" try:text=getHTMLText(nameUrl)if not re.findall(r'暂无(.+)!',text): root = "D://pics//"+name+"//"if not os.path.exists(root):os.mkdir(root)downPictures(text,root,name)try:nextPage=re.findall(r'next" href="(.+)"',text)[0]while(nextPage):nextText=getHTMLText(nextPage)downPictures(nextText,root,name) nextPage=re.findall(r'next" href="(.+)"',nextText)[0]except IndexError:print("已全部下载完毕")except TypeError:print("不好意思,没有{}的照片".format(name))returnif __name__ == '__main__':main()