正则表达式抓取图片

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import re
import urllib.request
import requests
import os
import urllib.error
from bs4 import BeautifulSoup

def getHtmlText(url):
    """Fetch a page over HTTP and return its body decoded as UTF-8.

    Args:
        url: Address of the page to request.

    Returns:
        The response text when the server answers with status 200.
        ``None`` when the status differs or when the request itself fails
        (connection error, timeout, invalid URL); a message is printed in
        both cases.
    """
    try:
        # Without a timeout a single stalled connection blocks the crawl forever.
        res_date = requests.get(url=url, timeout=10)
    except requests.RequestException:
        # Transport-level failures follow the same contract as a bad status.
        print("请求状态异常......")
        return None

    if res_date.status_code != requests.codes.OK:
        print("请求状态异常......")
        return None

    # Force UTF-8 decoding instead of relying on the encoding requests guessed.
    res_date.encoding = "utf-8"
    return res_date.text


def parseHtml(htmlText):
    """Extract the lazy-loaded image paths from a listing page.

    The pattern matches an ``<a href=...>`` element wrapping an
    ``<img class="lazy" data-original=... title=... />`` tag and captures
    the link target, the ``data-original`` value and the title; only the
    ``data-original`` value is kept.

    Args:
        htmlText: Page markup as a string. ``None`` or an empty string is
            treated as a page without matches.

    Returns:
        A list with the ``data-original`` value of every match, in document
        order, or ``None`` (after printing a message) when nothing matched.
    """
    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    patten = re.compile(
        r'<a href="(.*?)".*?>[\s]*<img class="lazy" data-original="(.*?)".*? title="(.*?)" /></a>'
    )
    # Guard against None so a failed fetch does not raise TypeError here.
    listInfos = patten.findall(htmlText) if htmlText else []
    if not listInfos:
        print("数据列表为空,解析失败.......")
        return None
    # Each match is a (href, data-original, title) tuple; index 1 is the image.
    return [info[1] for info in listInfos]


def downloadImgList(imgsrc, save_dir="D://322031608IMG"):
    """Download a single image into a local directory.

    The file is stored under the last ``/``-separated segment of the URL.

    Args:
        imgsrc: URL of the image to fetch.
        save_dir: Directory that receives the file; it is created when
            missing. Defaults to the directory this script originally used.

    Returns:
        None. Download errors raised by urllib (``URLError`` and its
        subclasses such as ``HTTPError``) are reported on stdout and not
        re-raised.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(save_dir, exist_ok=True)
    imgFileName = imgsrc.split("/")[-1]
    try:
        urllib.request.urlretrieve(imgsrc, os.path.join(save_dir, imgFileName))
    except urllib.error.URLError:
        # URLError is the base class of HTTPError, so unreachable hosts and
        # truncated downloads are handled the same way as HTTP error statuses.
        print("下载图片出现错误.......")


if __name__ == "__main__":
    # Crawl listing pages 1..20, printing a text progress bar per page and
    # downloading every image found on each page.
    prr_url = "https://www.toopic.cn"
    base_url = "https://www.toopic.cn/dnbz/?q=--81--.html"
    urlList = [base_url + "&page={}".format(i) for i in range(1, 21)]

    jindu = 0  # progress in percent; each of the 20 pages adds 5
    for url in urlList:
        html = getHtmlText(url)
        # getHtmlText returns None on failure; do not hand that to the parser.
        dataSet = parseHtml(html) if html is not None else None
        print('\n\r[%-100s]%.2f%%' % ('=' * int(jindu), jindu), end='')
        jindu = jindu + 5
        print("")
        if not dataSet:
            # Nothing to download for this page (failed request or no match);
            # move on instead of crashing on iteration over None.
            continue
        for info in dataSet:
            # Parsed paths are site-relative, so prefix them with the host.
            imgsrc = prr_url + info
            print(imgsrc)
            downloadImgList(imgsrc)
    print("全部下载完毕....")