from lxml import etree
import requests
import json
import re

# Pretend to be a desktop browser so Douban serves the normal page.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36 Edg/99.0.1150.36"
}


def getOnePage(url):
    """Fetch one page of the Top 250 list; return its HTML, or None on failure."""
    try:
        res = requests.get(url, headers=header)
        if res.status_code == 200:
            return res.text
        return None
    except Exception:
        return None
def parseOnePage(html):
    """Yield one record per book on a Top 250 page."""
    selector_html = etree.HTML(html)
    items = selector_html.xpath('//tr[@class="item"]')
    for item in items:
        # Publication info is slash-separated: "author / publisher / date / price".
        book_infos = item.xpath("td/p/text()")[0]
        # First span text is the rating; the second holds the rating-count text.
        book_sss = item.xpath("td/div[@class='star clearfix']/span/text()")
        # Pull the first number (the rating count) out of the second span.
        a = re.findall(r'\d+\.?\d*', book_sss[1])
        yield {
            "name": item.xpath("td/div/a/@title")[0],
            "url": item.xpath("td/div/a/@href")[0],
            "author": book_infos.split("/")[0],
            "publisher": book_infos.split("/")[-3],
            "date": book_infos.split("/")[-2],
            "price": book_infos.split("/")[-1],
            "grade": book_sss[0],
            "numberOne": a[0],
        }


def save(content):
    """Append one record per line as JSON (JSON Lines) to the output file."""
    with open("322031608冯越.txt", "at", encoding="utf-8") as f:
        f.write(json.dumps(content, ensure_ascii=False) + "\n")
def getTop250(url):
    """Crawl one list page: fetch, parse, and save every book on it."""
    html = getOnePage(url)
    if html is None:  # fetch failed; skip this page
        return
    for item in parseOnePage(html):
        print(item)
        save(item)
# 25 books per page, so the start offsets are 0, 25, ..., 225.
urls = ["https://book.douban.com/top250?start={}".format(i) for i in range(0, 250, 25)]
for url in urls:
    getTop250(url)
    print("Crawling...")
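
# The two helpers below are optional checks, not part of the crawl itself;
# they are a minimal sketch assuming the hypothetical filename
# "top250_page1.html" for a list page saved from a browser.

def smokeTest():
    """Offline check: run parseOnePage on a locally saved page, no network needed."""
    with open("top250_page1.html", encoding="utf-8") as f:
        for record in parseOnePage(f.read()):
            print(record["name"], record["grade"], record["numberOne"])


def loadResults():
    """Reload the crawl output: save() writes one JSON object per line."""
    with open("322031608冯越.txt", encoding="utf-8") as f:
        return [json.loads(line) for line in f]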