1:安装python
只有英文版
2:安装扩展
直接cmd安装
pip install xlwt
pip install bs4
3:运行代码
import re import xlwt import requests from bs4 import BeautifulSoup def getHtml(url): headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0'} page = requests.get(url,headers = headers) html =page.text return html if __name__=='__main__': Workbook = xlwt.Workbook() sheet = Workbook.add_sheet('豆瓣读书Top250') sheet.write(2,2,'书名') sheet.write(2,3,'作者') sheet.write(2,4,'译者') sheet.write(2,5,'出版单位') sheet.write(2,6,'出版时间') sheet.write(2,7,'定价') sheet.write(2,8,'豆瓣评分') sheet.write(2,9,'评价人数') sheet.write(2,10,'短评') i = 3 j = 3 k = 3 m = 3 for page in range(0,250,25): url = 'https://book.douban.com/top250?start={0}'.format(page) html = getHtml(url) Soup = BeautifulSoup(html,'html.parser') names = Soup.find_all('div',class_ = 'pl2') for name in names: book = name.find('a') book = book.text.strip() book = book.replace(' ','') sheet.write(i,2,book) i += 1 Infos = Soup.find_all('p',class_ = 'pl') for Info in Infos: r = 1 authorinfo = Info.text authors = authorinfo.split('/') if len(authors) < 4: sheet.write(j,5,authors[0]) sheet.write(j,6,authors[1]) sheet.write(j,7,authors[2]) j += 1 continue sheet.write(j,3,authors[0]) if authorinfo.count('/') == 4: sheet.write(j,4,authors[r]) r += 1 sheet.write(j,5,authors[r]) sheet.write(j,6,authors[r+1]) sheet.write(j,7,authors[r+2]) j += 1 rating_nums = Soup.find_all('div',class_ = 'star clearfix') for rating in rating_nums: star = rating.find_all('span') sheet.write(k,8,star[1].text) reg = r'\d+' vote = re.findall(reg,star[2].text) sheet.write(k,9,vote) k += 1 quotes = Soup.find_all('p',class_ = 'quote') for quote in quotes: sheet.write(m,10,quote.text) m += 1 Workbook.save('豆瓣读书Top250.xls')
直接运行会生成“豆瓣读书Top250.xls”文件,格式可以自己整理一下。
代码目前有效,不排除以后豆瓣改规则,源码可能会失效。
可以直接下载我爬取的豆瓣读书TOP250:
https://pan.xuzai.com/s/ToP7sa2nJwWdzNB
源码文件:
https://pan.xuzai.com/s/F9DRjwankpF4oac