"""Scrape rental listings from fangstar.com and save them to a JSON file.

Fetches listing pages 1-10, extracts the fields of every ``rhs-list-item``
element, prints each record as it is collected, and finally writes all
records to ``house_homework.json`` in the form ``{"data": [...]}``.
"""
import json
import sys

import requests
from bs4 import BeautifulSoup

header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36 Edg/99.0.1150.36"
}

# Seconds to wait for the server before giving up on a page; without a
# timeout ``requests.get`` can block forever on a stalled connection.
REQUEST_TIMEOUT = 10


def _plain(tag):
    """Return ``tag.string`` as a plain ``str``, or ``None`` if it has none."""
    text = tag.string
    # A NavigableString keeps a reference to its whole parse tree; converting
    # to ``str`` lets each page's tree be released once its records are built.
    return None if text is None else str(text)


def _parse_listing(item):
    """Build the record dict for one ``rhs-list-item`` element.

    Raises AttributeError, IndexError, KeyError or TypeError when an
    expected element or attribute is missing from the listing's markup.
    """
    base_info = item.find(class_="base-info")
    title = base_info.find(class_="dic-name nowrap")["title"]

    # Spans 1-4 of the base-info block are read as type, area, floor, year.
    info_spans = base_info.find_all("span")
    house_type = _plain(info_spans[1])
    area = _plain(info_spans[2])
    floor = _plain(info_spans[3])
    year = _plain(info_spans[4])

    # The rent is the concatenation of the first two spans of total-price.
    price_spans = item.find(class_="total-price").find_all("span")
    price = f"{price_spans[0].string}{price_spans[1].string}"

    # The address joins the first two links and the first span of the
    # location block with hyphens.
    location = item.find(class_="location-wrap nowrap")
    links = location.find_all("a")
    location_spans = location.find_all("span")
    address = f"{links[0].string}-{links[1].string}-{location_spans[0].string}"

    post_spans = item.find(class_="post-info").find_all("span")
    author = _plain(post_spans[0])
    times = _plain(post_spans[1])

    return {"标题": title, "房屋类型": house_type, "面积": area, "楼层": floor, "建筑年限": year, "租金": price, "地点": address, "作者": author, "发布时间": times}


houses = []
for page in range(1, 11):
    url = f"https://www.fangstar.com/rentls/pg{page}/"
    try:
        response = requests.get(url, headers=header, timeout=REQUEST_TIMEOUT)
        # Treat HTTP error statuses as failures instead of parsing the
        # error page as if it were an (empty) listing page.
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"page {page}: request failed ({exc}); skipping", file=sys.stderr)
        continue

    soup = BeautifulSoup(response.text, "html.parser")
    for item in soup.find_all(class_="rhs-list-item"):
        try:
            house = _parse_listing(item)
        except (AttributeError, IndexError, KeyError, TypeError) as exc:
            # One malformed listing should not discard everything scraped
            # so far; report it and move on.
            print(f"page {page}: skipping malformed listing ({exc!r})", file=sys.stderr)
            continue
        houses.append(house)
        print(f"第{page}页")
        print(house)

print(houses)

j = {"data": houses}
# Mode "w", not "a": appending a second JSON document on a re-run would
# leave the file unparseable.
with open("house_homework.json", "w", encoding="UTF-8") as f:
    json.dump(j, f, ensure_ascii=False)