完整代码:
# Scrape the Douban Top 250 movie list and save it to an .xls workbook.
import re
import urllib.error
import urllib.request

import xlwt
from bs4 import BeautifulSoup

# Douban serves the Top 250 list as 10 pages of 25 entries, addressed by
# the "start" offset appended to the base URL.
_PAGE_COUNT = 10
_PAGE_SIZE = 25

# Placeholder written to the sheet when a field is missing from an entry
# (the same value the script has always used for a missing quote/title).
_MISSING = " "

# Patterns applied to the HTML text of one <div class="item"> element.
findLink = re.compile(r'<a href="(.*?)">')
findImage = re.compile(r'<img.*?src="(.*?)"', re.S)
findTitle = re.compile(r'<span class="title">(.*?)</span>')
findRating = re.compile(r'span class="rating_num" property="v:average">(.*?)</span>')
findJudge = re.compile(r'<span>(\d*)人评价</span>')
findInq = re.compile(r'<span class="inq">(.*?)</span>')
findBd = re.compile(r'<p class="">(.*?)</p>', re.S)

# <br/> tags (with optional surrounding whitespace) inside the info paragraph.
_BR_TAG = re.compile(r'<br(\s+)?/>(\s+)?')


def main():
    """Download the Top 250 list and write it to '豆瓣电影Top250.xls'."""
    baseurl = 'https://movie.douban.com/top250?start='
    datalist = getDate(baseurl)
    savepath = '豆瓣电影Top250.xls'
    saveDate(datalist, savepath)


def _first_match(pattern, text, default=_MISSING):
    """Return the first capture of *pattern* in *text*, or *default* if none."""
    found = pattern.findall(text)
    return found[0] if found else default


def _parse_item(item):
    """Extract one movie record from the HTML text of a single list entry.

    Returns a list of 8 strings in sheet-column order: detail link, image
    link, Chinese title, foreign title, rating, number of ratings, one-line
    quote, and the related-info paragraph. A field that cannot be found is
    replaced by a single space so that one odd entry does not abort the run.
    """
    data = [
        _first_match(findLink, item),
        _first_match(findImage, item),
    ]

    titles = findTitle.findall(item)
    if len(titles) == 2:
        # Second title span is the foreign title, prefixed with a "/".
        data.append(titles[0])
        data.append(titles[1].replace("/", ""))
    else:
        data.append(titles[0] if titles else _MISSING)
        data.append(' ')

    data.append(_first_match(findRating, item))
    data.append(_first_match(findJudge, item))

    quotes = findInq.findall(item)
    data.append(quotes[0].replace("。", "") if quotes else " ")

    info = _first_match(findBd, item, default="")
    info = _BR_TAG.sub(" ", info)
    info = info.replace('/', " ")
    data.append(info.strip())
    return data


def getDate(baseurl):
    """Fetch every page of the list and return all parsed movie records.

    *baseurl* is the list URL up to and including 'start='; the page offset
    is appended to it. Returns a list of 8-element records (see
    ``_parse_item``). Pages that could not be downloaded contribute no
    records.
    """
    datalist = []
    for page in range(_PAGE_COUNT):
        html = askURL(baseurl + str(page * _PAGE_SIZE))
        if not html:
            # askURL has already printed the error; nothing to parse.
            continue
        soup = BeautifulSoup(html, 'html.parser')
        for item in soup.find_all('div', class_='item'):
            datalist.append(_parse_item(str(item)))
    return datalist


def askURL(url):
    """Return the UTF-8 decoded body of *url*, or "" if the request fails.

    A browser User-Agent header is sent with the request. On
    ``urllib.error.URLError`` the status code (if any) and the error are
    printed and an empty string is returned.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
    }
    request = urllib.request.Request(url, headers=header)
    html = ""
    try:
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e)
    return html


def saveDate(datalist, savepath):
    """Write *datalist* to an .xls workbook at *savepath*.

    Row 0 holds the column headings; each record in *datalist* fills one
    following row. Every record that was scraped is written, so a partial
    scrape still produces a workbook instead of failing on a missing index.
    """
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet('豆瓣电影Top250')
    col = ('电影详情链接', '图片链接', '影片中文名', '影片外国名', '评分', '评价人数', '概况', '相关信息')
    for j, heading in enumerate(col):
        sheet.write(0, j, heading)
    for i, data in enumerate(datalist):
        print("第%d条" % (i + 1))
        for j in range(len(col)):
            sheet.write(i + 1, j, data[j])
    book.save(savepath)


if __name__ == '__main__':
    main()
    print("爬取完成!")
直接运行会生成“豆瓣电影Top250.xls”文件,格式可以自己整理一下。
代码目前有效,但不排除以后豆瓣修改页面规则,届时源码可能会失效。
可以直接下载我爬取的豆瓣电影TOP250:
https://pan.xuzai.com/s/nSLdinyY257Akd4
源码文件:
https://pan.xuzai.com/s/yz2pWj74jwsHmem
好东西,周松松来拜访
欢迎访问,会定期回访。