Scrapy practice project: scraping Dangdang book listings

spider

import re

import scrapy

from ..items import DangdangItem


class PythonbookSpider(scrapy.Spider):
    name = "pythonbook"
    start_urls = ["http://search.dangdang.com/?key=python&act=input"]
    p = 1  # current page number

    def parse(self, response):
        books = response.xpath(".//ul[@class='bigimg']/li")
        for book in books:
            # create a fresh item per book, so queued items are not overwritten
            item = DangdangItem()
            item['name'] = book.xpath("./a/@title").extract_first()
            price = book.xpath(".//span[@class='search_now_price']").extract_first() or ""
            price = re.findall(r".*?([0-9]*\.[0-9]*)", price)
            item['price'] = price[0] if price else None
            item['pic'] = book.xpath(".//img/@data-original|.//img/@src").extract_first()
            item['author'] = book.xpath(".//p[@class='search_book_author']/span[1]/a[1]/@title").extract_first()
            item['publisher'] = book.xpath(".//p[@class='search_book_author']/span[3]/a/@title").extract_first()
            item['comments'] = book.xpath(".//a[@class='search_comment_num']/text()").extract_first()
            item['pubdate'] = book.re_first("(([0-9]{4})-([0-9]{2})-([0-9]{2}))")
            item['description'] = book.xpath(".//p[@class='detail']/text()").extract_first()
            yield item
        # follow the "next page" link, capped at 20 pages
        if self.p < 20:
            self.p += 1
            path = response.xpath(".//li[@class='next']/a/@href").extract_first()
            if path:
                yield scrapy.Request("http://search.dangdang.com{}".format(path))
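
The spider is started with scrapy crawl pythonbook from the project root. For debugging inside an IDE, a small driver script works too; this is a sketch, and the module path dangdang/spiders/pythonbook.py is an assumption based on the usual project layout:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from dangdang.spiders.pythonbook import PythonbookSpider  # assumed file name

process = CrawlerProcess(get_project_settings())  # load settings.py
process.crawl(PythonbookSpider)
process.start()  # blocks until the crawl finishes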

pipeline


# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import os

import pymysql
from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline

from dangdang.settings import IMAGES_STORE, mysql_local


class DangdangPipeline:
    def open_spider(self, spider):
        # open the MySQL connection and create the table once, when the spider starts
        self.conn = pymysql.connect(**mysql_local)
        self.cursor = self.conn.cursor()
        sql = '''
        create table if not exists booktable(
            id int primary key auto_increment,
            author varchar(50),
            comments varchar(10),
            description varchar(200),
            name varchar(100),
            pic varchar(50),
            price varchar(10),
            publisher varchar(20)
        )'''
        self.cursor.execute(sql)

    def process_item(self, item, spider):
        print("Data received by the pipeline:", tuple(item.values()))
        print("Title:", item['name'])
        sql = '''
        insert into booktable(author,comments,description,name,pic,price,publisher)
        values (%s,%s,%s,%s,%s,%s,%s)
        '''
        self.cursor.execute(sql, (
            item['author'], item['comments'], item['description'], item['name'],
            item['pic'], item['price'], item['publisher']
        ))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()


class MyImagesPipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        # Take the image URL stored in the item and issue a Request to download it;
        # the framework handles the Request callback itself.
        path = item['pic']
        path = "https:{}".format(path)
        yield Request(url=path)

    def item_completed(self, results, item, info):
        # After the download finishes, rename the file; otherwise the default
        # file name is the hash of the URL.
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        os.rename(IMAGES_STORE + '/' + image_paths[0],
                  IMAGES_STORE + "/full/" + item["name"] + ".jpg")
        return item
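
Renaming in item_completed works, but os.rename will fail if a book title contains characters that are illegal in file names (slashes, colons, and so on). A cleaner alternative is to override file_path, the hook ImagesPipeline calls to decide where each image is saved; the item keyword argument requires Scrapy 2.4+. A sketch, with SafeImagesPipeline as a hypothetical name:

import re

from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline


class SafeImagesPipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        yield Request(url="https:{}".format(item['pic']))

    def file_path(self, request, response=None, info=None, *, item=None):
        # Strip characters that are illegal in file names before using the title.
        safe_name = re.sub(r'[\\/:*?"<>|]', "_", item['name'])
        return "full/{}.jpg".format(safe_name)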



item

import scrapy


class DangdangItem(scrapy.Item):
    '''
    Item class for Dangdang books, holding every field to be scraped.
    '''

    # define the fields for your item here like:
    # name = scrapy.Field()

    name = scrapy.Field()         # book title
    price = scrapy.Field()        # price
    pic = scrapy.Field()          # image URL
    author = scrapy.Field()       # author
    publisher = scrapy.Field()    # publisher
    comments = scrapy.Field()     # number of comments
    pubdate = scrapy.Field()      # publication date
    description = scrapy.Field()  # description
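
scrapy.Item behaves like a dict, which is why the spider and pipeline read and write fields with item['name']; the difference is that only declared fields are accepted. A quick check with hypothetical values:

item = DangdangItem()
item['name'] = 'Fluent Python'   # OK: 'name' is declared above
print(dict(item))                # {'name': 'Fluent Python'}
item['isbn'] = '9781491946008'   # KeyError: 'isbn' is not a declared field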


settings

BOT_NAME = "dangdang"

SPIDER_MODULES = ["dangdang.spiders"]
NEWSPIDER_MODULE = "dangdang.spiders"

DEFAULT_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en",
}

ITEM_PIPELINES = {
    "dangdang.pipelines.DangdangPipeline": 300,
    "dangdang.pipelines.MyImagesPipeline": 500,
}

REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

# MySQL connection parameters, unpacked into pymysql.connect() by the pipeline
mysql_local = {
    'host': 'localhost',
    'port': 3306,
    'user': 'root',
    'password': '123456',
    'db': 'pythonbook',
}

IMAGES_STORE = "./imgs"
MEDIA_ALLOW_REDIRECTS = True
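
One thing the settings do not cover: pymysql.connect(**mysql_local) fails unless the pythonbook database already exists. A one-off sketch to create it, reusing the credentials above (utf8mb4 so Chinese titles store correctly):

import pymysql

# One-off setup: create the database that mysql_local points at.
conn = pymysql.connect(host='localhost', port=3306, user='root', password='123456')
with conn.cursor() as cursor:
    cursor.execute("CREATE DATABASE IF NOT EXISTS pythonbook CHARACTER SET utf8mb4")
conn.close()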