# NOTE(review): removed a stray run of line numbers left over from a copy/paste.
import os
import re
import urllib.error
import urllib.request

import requests
from bs4 import BeautifulSoup
def getHtmlText(url):
    """Fetch *url* and return its body decoded as UTF-8.

    Parameters
    ----------
    url : str
        Absolute URL of the listing page to download.

    Returns
    -------
    str or None
        The page text on HTTP 200; ``None`` on a non-OK status or any
        network failure.
    """
    try:
        # A timeout keeps one slow/unresponsive page from hanging the
        # entire crawl; previously this call could block forever.
        res_date = requests.get(url=url, timeout=10)
    except requests.RequestException:
        # Connection errors / timeouts used to propagate and kill the
        # script; report and let the caller skip this page instead.
        print("请求状态异常......")
        return None
    if res_date.status_code == requests.codes.OK:
        # Force UTF-8: the site serves UTF-8 pages but the charset header
        # may be missing, which would garble the Chinese titles.
        res_date.encoding = "utf-8"
        return res_date.text
    print("请求状态异常......")
    return None
def parseHtml(htmlText):
    """Extract image URLs (the ``data-original`` attribute) from a listing page.

    Parameters
    ----------
    htmlText : str or None
        Raw HTML of a listing page, or ``None`` when the download failed.

    Returns
    -------
    list[str] or None
        Relative image paths in page order, or ``None`` when the input is
        empty/None or nothing matched.
    """
    if not htmlText:
        # getHtmlText() returns None on failure; previously re.findall
        # crashed here with a TypeError.
        print("数据列表为空,解析失败.......")
        return None
    # Raw string: the original non-raw '\s' is an invalid escape sequence
    # (SyntaxWarning on recent CPython). Each match is (href, img_src, title).
    patten = re.compile(
        r'<a href="(.*?)".*?>[\s]*<img class="lazy" data-original="(.*?)".*? title="(.*?)" /></a>'
    )
    listInfos = patten.findall(htmlText)
    if not listInfos:
        print("数据列表为空,解析失败.......")
        return None
    # Only the image source (group 2) is needed by the downloader.
    return [info[1] for info in listInfos]
def downloadImgList(imgsrc, save_dir="D://322031608IMG"):
    """Download a single image into *save_dir*, creating it on demand.

    Parameters
    ----------
    imgsrc : str
        Absolute URL of the image to fetch.
    save_dir : str, optional
        Target directory. Defaults to the original hard-coded location so
        existing callers are unaffected.
    """
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    # Name the local file after the last path segment of the URL.
    imgFileName = imgsrc.split("/")[-1]
    try:
        urllib.request.urlretrieve(imgsrc, os.path.join(save_dir, imgFileName))
    except urllib.error.URLError:
        # URLError is the base of HTTPError, so DNS/connection failures are
        # now also caught instead of aborting the whole crawl.
        print("下载图片出现错误.......")
if __name__ == "__main__":
    # Site root, prepended to the relative image paths found in listings.
    prr_url = "https://www.toopic.cn"
    base_url = "https://www.toopic.cn/dnbz/?q=--81--.html"
    # Build the 20 paginated listing URLs up front.
    urlList = [base_url + "&page={}".format(i) for i in range(1, 21)]
    jindu = 0
    for url in urlList:
        html = getHtmlText(url)
        dataSet = parseHtml(html)
        # Crude text progress bar: 20 pages * 5% per page.
        print('\n\r[%-100s]%.2f%%' % ('=' * int(jindu), jindu), end='')
        jindu = jindu + 5
        print("")
        if not dataSet:
            # A failed download or an empty parse returns None; previously
            # iterating it raised TypeError and killed the whole run.
            continue
        for info in dataSet:
            imgsrc = prr_url + info
            print(imgsrc)
            downloadImgList(imgsrc)
    print("全部下载完毕....")