Scrapy practice project: scraping Dangdang book listings

spider

import re

import scrapy

from ..items import DangdangItem


class PythonbookSpider(scrapy.Spider):
    name = "pythonbook"
    start_urls = ["http://search.dangdang.com/?key=python&act=input"]
    p = 1  # current page number

    def parse(self, response):
        books = response.xpath(".//ul[@class='bigimg']/li")
        for book in books:
            # create a fresh item per book, so queued items are not overwritten
            item = DangdangItem()
            item['name'] = book.xpath("./a/@title").extract_first()
            price = book.xpath(".//span[@class='search_now_price']").extract_first() or ""
            price = re.findall(r".*?([0-9]*\.[0-9]*)", price)
            item['price'] = price[0] if price else None
            item['pic'] = book.xpath(".//img/@data-original|.//img/@src").extract_first()
            item['author'] = book.xpath(".//p[@class='search_book_author']/span[1]/a[1]/@title").extract_first()
            item['publisher'] = book.xpath(".//p[@class='search_book_author']/span[3]/a/@title").extract_first()
            item['comments'] = book.xpath(".//a[@class='search_comment_num']/text()").extract_first()
            item['pubdate'] = book.re_first("(([0-9]{4})-([0-9]{2})-([0-9]{2}))")
            item['description'] = book.xpath(".//p[@class='detail']/text()").extract_first()
            yield item
        # follow the "next page" link, capped at 20 pages
        if self.p < 20:
            self.p += 1
            path = response.xpath(".//li[@class='next']/a/@href").extract_first()
            if path:
                yield scrapy.Request("http://search.dangdang.com{}".format(path))
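
The spider is started with scrapy crawl pythonbook from the project root. For debugging inside an IDE, a small driver script works too; this is a sketch, and the module path dangdang/spiders/pythonbook.py is an assumption based on the usual project layout:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from dangdang.spiders.pythonbook import PythonbookSpider  # assumed file name

process = CrawlerProcess(get_project_settings())  # load settings.py
process.crawl(PythonbookSpider)
process.start()  # blocks until the crawl finishes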

pipeline


# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import os

import pymysql
from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline

from dangdang.settings import IMAGES_STORE, mysql_local


class DangdangPipeline:
    def open_spider(self, spider):
        # open the MySQL connection and create the table once, when the spider starts
        self.conn = pymysql.connect(**mysql_local)
        self.cursor = self.conn.cursor()
        sql = '''
        create table if not exists booktable(
            id int primary key auto_increment,
            author varchar(50),
            comments varchar(10),
            description varchar(200),
            name varchar(100),
            pic varchar(50),
            price varchar(10),
            publisher varchar(20)
        )'''
        self.cursor.execute(sql)

    def process_item(self, item, spider):
        print("Data received by the pipeline:", tuple(item.values()))
        print("Title:", item['name'])
        sql = '''
        insert into booktable(author,comments,description,name,pic,price,publisher)
        values (%s,%s,%s,%s,%s,%s,%s)
        '''
        self.cursor.execute(sql, (
            item['author'], item['comments'], item['description'], item['name'],
            item['pic'], item['price'], item['publisher']
        ))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()


class MyImagesPipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        # Take the image URL stored in the item and issue a Request to download it;
        # the framework handles the Request callback itself.
        path = item['pic']
        path = "https:{}".format(path)
        yield Request(url=path)

    def item_completed(self, results, item, info):
        # After the download finishes, rename the file; otherwise the default
        # file name is the hash of the URL.
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        os.rename(IMAGES_STORE + '/' + image_paths[0],
                  IMAGES_STORE + "/full/" + item["name"] + ".jpg")
        return item
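
Renaming in item_completed works, but os.rename will fail if a book title contains characters that are illegal in file names (slashes, colons, and so on). A cleaner alternative is to override file_path, the hook ImagesPipeline calls to decide where each image is saved; the item keyword argument requires Scrapy 2.4+. A sketch, with SafeImagesPipeline as a hypothetical name:

import re

from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline


class SafeImagesPipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        yield Request(url="https:{}".format(item['pic']))

    def file_path(self, request, response=None, info=None, *, item=None):
        # Strip characters that are illegal in file names before using the title.
        safe_name = re.sub(r'[\\/:*?"<>|]', "_", item['name'])
        return "full/{}.jpg".format(safe_name)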



item

import scrapy


class DangdangItem(scrapy.Item):
    '''
    Item class for Dangdang books, holding every field to be scraped.
    '''

    # define the fields for your item here like:
    # name = scrapy.Field()

    name = scrapy.Field()         # book title
    price = scrapy.Field()        # price
    pic = scrapy.Field()          # image URL
    author = scrapy.Field()       # author
    publisher = scrapy.Field()    # publisher
    comments = scrapy.Field()     # number of comments
    pubdate = scrapy.Field()      # publication date
    description = scrapy.Field()  # description
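
scrapy.Item behaves like a dict, which is why the spider and pipeline read and write fields with item['name']; the difference is that only declared fields are accepted. A quick check with hypothetical values:

item = DangdangItem()
item['name'] = 'Fluent Python'   # OK: 'name' is declared above
print(dict(item))                # {'name': 'Fluent Python'}
item['isbn'] = '9781491946008'   # KeyError: 'isbn' is not a declared field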


settings

BOT_NAME = "dangdang"

SPIDER_MODULES = ["dangdang.spiders"]
NEWSPIDER_MODULE = "dangdang.spiders"

DEFAULT_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en",
}

ITEM_PIPELINES = {
    "dangdang.pipelines.DangdangPipeline": 300,
    "dangdang.pipelines.MyImagesPipeline": 500,
}

REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

# MySQL connection parameters, unpacked into pymysql.connect() by the pipeline
mysql_local = {
    'host': 'localhost',
    'port': 3306,
    'user': 'root',
    'password': '123456',
    'db': 'pythonbook',
}

IMAGES_STORE = "./imgs"
MEDIA_ALLOW_REDIRECTS = True
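
One thing the settings do not cover: pymysql.connect(**mysql_local) fails unless the pythonbook database already exists. A one-off sketch to create it, reusing the credentials above (utf8mb4 so Chinese titles store correctly):

import pymysql

# One-off setup: create the database that mysql_local points at.
conn = pymysql.connect(host='localhost', port=3306, user='root', password='123456')
with conn.cursor() as cursor:
    cursor.execute("CREATE DATABASE IF NOT EXISTS pythonbook CHARACTER SET utf8mb4")
conn.close()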