爬取房星网租房数据并打包成json文件

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import requests
from bs4 import BeautifulSoup
import json
# Browser-like User-Agent sent with every listing-page request.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36 Edg/99.0.1150.36"
}


def _parse_listing(item):
    """Build one rental record (a dict) from a single ``rhs-list-item`` tag.

    Raises AttributeError, IndexError, KeyError or TypeError when the tag
    does not contain the elements looked up below; the caller decides how
    to handle such a listing.
    """
    base_info = item.find(attrs={"class": "base-info"})
    # Look the spans up once instead of re-searching "base-info" per field.
    spans = base_info.find_all("span")
    title = base_info.find(attrs={"class": "dic-name nowrap"})["title"]

    price_spans = item.find(attrs={"class": "total-price"}).find_all("span")
    price = "{}{}".format(price_spans[0].string, price_spans[1].string)

    location = item.find(attrs={"class": "location-wrap nowrap"})
    links = location.find_all("a")
    location_spans = location.find_all("span")
    address = "{0}-{1}-{2}".format(
        links[0].string, links[1].string, location_spans[0].string
    )

    post_spans = item.find(attrs={"class": "post-info"}).find_all("span")

    return {
        "标题": title,
        "房屋类型": spans[1].string,
        "面积": spans[2].string,
        "楼层": spans[3].string,
        "建筑年限": spans[4].string,
        "租金": price,
        "地点": address,
        "作者": post_spans[0].string,
        "发布时间": post_spans[1].string,
    }


houses = []
# Crawl listing pages 1 through 10.
for page in range(1, 11):
    url = f"https://www.fangstar.com/rentls/pg{page}/"
    # A timeout keeps the crawl from hanging forever on a stalled connection.
    h = requests.get(url=url, headers=header, timeout=10)
    html = BeautifulSoup(h.text, "html.parser")
    box = html.find_all(attrs={"class": "rhs-list-item"})
    # NOTE(review): the flattened source does not show whether this banner
    # was printed once per page or once per listing; once per page assumed.
    print(f"第{page}页")
    for item in box:
        try:
            house = _parse_listing(item)
        except (AttributeError, IndexError, KeyError, TypeError) as err:
            # One card with an unexpected layout should not abort the crawl.
            print(f"skipped a listing on page {page}: {err!r}")
            continue
        houses.append(house)
        print(house)
print(houses)

j = {
    "data": houses
}
# "w" rather than "a": appending a second JSON document on a rerun would
# leave a file that json.load can no longer parse.
with open("house_homework.json", "w", encoding="UTF-8") as f:
    out = json.dumps(j, ensure_ascii=False)
    f.write(out)

测试：读取生成的 JSON 文件并逐条打印

1
2
3
4
5
6
7
import json
# Read back the JSON file and print every record stored under "data",
# one record per line.
with open("house_homework.json", "r", encoding="utf-8") as f:
    json_data = json.load(f)

data = json_data["data"]
for record in data:
    print(record)

爬取图片网站

获取所有图片的网页路径

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import requests
from bs4 import BeautifulSoup
import json

# Step 1: collect the detail-page URL behind every thumbnail.
box = []
# range(0, 84, 42) yields the pid offsets 0 and 42, i.e. two listing pages.
for page_no, page in enumerate(range(0, 84, 42), start=1):
    print(f"第{page_no}页")
    # A timeout keeps the crawl from hanging forever on a stalled connection.
    ret = requests.get(url=f"网站&pid={page}", timeout=10)
    https = BeautifulSoup(ret.text, "html.parser")
    for thumb in https.find_all("span", attrs={"class": "thumb"}):
        # Calling a tag is BeautifulSoup shorthand for find_all().
        for link in thumb("a"):
            # Named detail_url so the builtin ``str`` is not shadowed.
            detail_url = "https://主网站{}".format(link['href'])
            box.append(detail_url)
            print(detail_url)

# Step 2: open each detail page and extract the full-size image URL.
out = []
for num in box:
    resp = requests.get(url=num, timeout=10)
    getImage = BeautifulSoup(resp.text, "html.parser").find_all("img", attrs={"id": "image"})
    if not getImage:
        # No <img id="image"> on this page; skip it instead of raising
        # IndexError and losing everything collected so far.
        print(f"no image found, skipped: {num}")
        continue
    # The src is prefixed with the scheme; presumably it is
    # protocol-relative ("//host/...") - verify against the site.
    en = f'https:{getImage[0]["src"]}'
    print(en)
    out.append(en)

j = {
    "data": out
}
# "w" rather than "a": appending a second JSON document on a rerun would
# leave a file that json.load can no longer parse.
with open("322031608/my_wife.json", "w") as f:
    f.write(json.dumps(j))

根据图片后缀名下载并保存图片

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import requests
import json

# Load the list of image URLs stored under the "data" key.
with open("322031608/my_wife.json", "r") as f:
    j = json.load(f)
data = j["data"]
print(data)

# Output files are named by a running counter that starts at 253.
y = 253
for i in data:
    # A timeout keeps the download from hanging forever on a stalled connection.
    s = requests.get(url=i, timeout=10)
    if not s.ok:
        # Do not save an HTTP error page under an image file name.
        print(f"download failed ({s.status_code}), skipped: {i}")
        # Still advance the counter so file numbers keep matching the
        # URL's position in the list.
        y = y + 1
        continue

    # Pick the extension from the URL text; anything that is neither
    # .gif nor .png is saved as .jpg.
    suffix = ".jpg"
    for candidate in (".gif", ".png"):
        if candidate in i:
            suffix = candidate
            break
    imgName = "{}{}".format(y, suffix)

    u = "images/{}".format(imgName)
    with open(u, "wb") as f:
        print("正在下载///")
        f.write(s.content)
    y = y + 1
print("下载完毕!!")