import os

import pymysql
from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline

from dangdang.settings import IMAGES_STORE
from dangdang.settings import mysql_local


class DangdangPipeline:
    def open_spider(self, spider):
        # Open one MySQL connection per spider run and make sure the
        # target table exists before any items arrive.
        self.conn = pymysql.connect(**mysql_local)
        self.cursor = self.conn.cursor()
        sql = '''
            create table if not exists booktable(
                id int primary key auto_increment,
                author varchar(50),
                comments varchar(10),
                description varchar(200),
                name varchar(100),
                pic varchar(50),
                price varchar(10),
                publisher varchar(20)
            )'''
        self.cursor.execute(sql)

    def process_item(self, item, spider):
        print("Data received by the pipeline:", tuple(item.values()))
        print("Name:", item['name'])
        sql = '''
            insert into booktable(author, comments, description, name, pic, price, publisher)
            values (%s, %s, %s, %s, %s, %s, %s)
        '''
        # Parameterized query; pymysql handles quoting and escaping.
        self.cursor.execute(sql, (
            item['author'],
            item['comments'],
            item['description'],
            item['name'],
            item['pic'],
            item['price'],
            item['publisher'],
        ))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()


class MyImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # Dangdang serves protocol-relative image URLs ("//img..."),
        # so prepend the scheme before requesting the file.
        url = "https:{}".format(item['pic'])
        yield Request(url=url)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        # ImagesPipeline names files by URL hash; rename the downloaded
        # cover to "<book name>.jpg" under IMAGES_STORE/full/.
        os.rename(
            os.path.join(IMAGES_STORE, image_paths[0]),
            os.path.join(IMAGES_STORE, 'full', item['name'] + '.jpg'),
        )
        return item
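
For reference, the pipelines above read seven fields from each item. A minimal items.py matching those keys could look like the sketch below; the field names come straight from the pipeline code, while the class name BookItem is an assumption.

# dangdang/items.py -- minimal sketch; field names are taken from the
# pipelines above, the class name BookItem is assumed.
import scrapy


class BookItem(scrapy.Item):
    author = scrapy.Field()
    comments = scrapy.Field()
    description = scrapy.Field()
    name = scrapy.Field()
    pic = scrapy.Field()
    price = scrapy.Field()
    publisher = scrapy.Field()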
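
Both pipelines also depend on values defined in dangdang/settings.py: mysql_local is unpacked straight into pymysql.connect(), IMAGES_STORE tells ImagesPipeline where to store downloads, and neither pipeline runs unless it is registered in ITEM_PIPELINES. A sketch of those settings follows; the credentials, database name, and storage path are placeholders, only the setting names are fixed by the code above.

# dangdang/settings.py -- sketch with placeholder values.
mysql_local = {
    'host': 'localhost',        # assumption: local MySQL server
    'port': 3306,
    'user': 'root',             # placeholder credentials
    'password': 'your_password',
    'database': 'dangdang',     # assumption: this database already exists
    'charset': 'utf8mb4',
}

IMAGES_STORE = './images'       # root directory for downloaded covers

ITEM_PIPELINES = {
    # Lower numbers run first: download and rename the cover before the
    # MySQL insert, so items dropped for missing images never reach the
    # database.
    'dangdang.pipelines.MyImagesPipeline': 300,
    'dangdang.pipelines.DangdangPipeline': 400,
}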