Querying with lxml
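The script below crawls the Douban Books Top 250 (https://book.douban.com/top250), parses each listing page with lxml's XPath queries, prints every book, and appends each record as a JSON line to a text file.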

from lxml import etree
import requests
import json
import re

# A browser User-Agent so Douban does not reject the request.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36 Edg/99.0.1150.36"
}

def getOnePage(url):
    # Fetch one listing page; return its HTML text, or None on any failure.
    try:
        res = requests.get(url, headers=header)
        if res.status_code == 200:
            return res.text
        return None
    except Exception:
        return None

def parseOnePage(html):
    # Build an lxml element tree and query it with XPath.
    selector_html = etree.HTML(html)
    items = selector_html.xpath('//tr[@class="item"]')
    for item in items:
        # The <p> text holds "author / publisher / date / price", separated by "/".
        book_infos = item.xpath("td/p/text()")[0]
        # The star div yields the rating followed by the review-count text.
        book_sss = item.xpath("td/div[@class='star clearfix']/span/text()")
        a = re.findall(r"\d+\.?\d*", book_sss[1])  # pull the number of reviews
        yield {
            "name": item.xpath("td/div/a/@title")[0],
            "url": item.xpath("td/div/a/@href")[0],
            "author": book_infos.split("/")[0],
            "publisher": book_infos.split("/")[-3],
            "date": book_infos.split("/")[-2],
            "price": book_infos.split("/")[-1],
            "grade": book_sss[0],
            "numberOne": a[0],
        }

def save(content):
    # Append one record per line as JSON (JSON Lines format).
    with open("322031608冯越.txt", "at", encoding="utf-8") as f:
        f.write(json.dumps(content, ensure_ascii=False) + "\n")

def getTop250(url):
    html = getOnePage(url)
    if html is None:  # skip the page if the request failed
        return
    for item in parseOnePage(html):
        print(item)
        save(item)

# The Top 250 list is paginated 25 books per page via the "start" parameter.
urls = ["https://book.douban.com/top250?start={}".format(i) for i in range(0, 250, 25)]
for url in urls:
    getTop250(url)
    print("Crawling...")
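As a quick sanity check of the XPath expressions used in parseOnePage, here is a minimal, self-contained sketch that runs the same queries against an invented HTML fragment shaped like one row of the Douban table (the fragment and its values are illustrative, not real data):

from lxml import etree
import re

# Hypothetical fragment mimicking one <tr class="item"> from the listing page.
sample = """
<table><tr class="item"><td>
  <div><a href="https://book.douban.com/subject/0/" title="Sample Book"></a></div>
  <p>Author / Publisher / 2001-1 / 19.00元</p>
  <div class="star clearfix"><span>9.6</span><span>(123456人评价)</span></div>
</td></tr></table>
"""

row = etree.HTML(sample).xpath('//tr[@class="item"]')[0]
print(row.xpath("td/div/a/@title")[0])                           # Sample Book
print(row.xpath("td/p/text()")[0])                               # Author / Publisher / 2001-1 / 19.00元
spans = row.xpath("td/div[@class='star clearfix']/span/text()")  # ['9.6', '(123456人评价)']
print(re.findall(r"\d+\.?\d*", spans[1])[0])                     # 123456

This mirrors how parseOnePage walks each row: attribute queries (@title, @href) for the link, text() for the slash-separated metadata, and a regex to pull the review count out of the second span.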