Data Storage in the Scrapy Framework

I. Writing to a local file manually

  • 1. Method 1: generate the output file directly from the crawl command

    scrapy crawl <spider_name> -o <output_file>  # the export format is inferred from the file extension (.json, .csv, .jl, ...)
    
  • 2. Method 2: write JSON directly with the json module (every pipeline on this page must also be enabled in ITEM_PIPELINES; see the settings sketch at the end of this section)

    import json

    class JsonPipeline(object):
        def __init__(self):
            # append mode, so repeated runs keep adding to the same file
            self.file = open('blog.json', 'a', encoding='utf8')

        def process_item(self, item, spider):
            if spider.name == 'blog':
                # each item is written as a pretty-printed JSON fragment followed by ",\n",
                # so the file as a whole is not one valid JSON document
                self.file.write(json.dumps(dict(item), indent=2, ensure_ascii=False) + ',\n')
            return item

        def close_spider(self, spider):
            self.file.close()
    
  • 3. Method 3: write with the codecs module

    import json    # remember to import both modules
    import codecs

    class ScrapytestPipeline(object):
        def __init__(self):
            self.file = codecs.open('item.json', 'w', encoding='utf-8')

        def process_item(self, item, spider):
            line = json.dumps(dict(item), indent=2, ensure_ascii=False) + ',\n'
            self.file.write(line)  # write the serialized item to the file
            return item

        def close_spider(self, spider):
            # close unconditionally; guarding on a hard-coded spider name would leak the file handle
            self.file.close()
    
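The pipeline classes above only take effect once they are enabled in ITEM_PIPELINES in settings.py. A minimal registration sketch, assuming the classes live in a module named myproject.pipelines (the module path and the priority numbers are assumptions):

    ITEM_PIPELINES = {
        'myproject.pipelines.JsonPipeline': 300,
        'myproject.pipelines.ScrapytestPipeline': 400,
    }

Lower numbers run first, and every enabled pipeline receives each item through its process_item method.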

II. Writing data to local files with Scrapy's built-in exporters

  • 1. Use the built-in JsonItemExporter to write to a local file

    from scrapy.exporters import JsonItemExporter
    
    class JsonPipeline(object):
        def __init__(self):
            self.file = open('blog.json', 'wb')  # exporters write bytes, so open the file in binary mode
            self.exporter = JsonItemExporter(self.file, ensure_ascii=False, encoding='utf8')
            self.exporter.start_exporting()  # writes the opening "[" of the JSON array
    
        def process_item(self, item, spider):
            if spider.name == 'blog':
                self.exporter.export_item(item)
            return item
    
        def close_spider(self, spider):
            self.exporter.finish_exporting()  # writes the closing "]" of the JSON array
            self.file.close()
    
  • 2. Use the built-in JsonLinesItemExporter to write to a local file

    from scrapy.exporters import JsonLinesItemExporter

    class JsonPipeline1(object):
        def __init__(self):
            self.fp = open('blog1.json', 'wb')
            self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf8')
    
        def process_item(self, item, spider):
            self.exporter.export_item(item)
            return item
    
        def close_spider(self, spider):
            self.fp.close()
    
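The practical difference between the two exporters is the file layout they produce. A standalone sketch, runnable outside Scrapy, using hypothetical items (the titles and URLs are made up for illustration):

    from scrapy.exporters import JsonItemExporter, JsonLinesItemExporter

    items = [
        {'title': 'first post', 'url': 'http://example.com/1'},
        {'title': 'second post', 'url': 'http://example.com/2'},
    ]

    # JsonItemExporter wraps all items in a single JSON array.
    with open('array.json', 'wb') as f:
        exporter = JsonItemExporter(f, ensure_ascii=False)
        exporter.start_exporting()
        for item in items:
            exporter.export_item(item)
        exporter.finish_exporting()

    # JsonLinesItemExporter writes one JSON object per line (JSON Lines),
    # which is easier to append to and to process as a stream.
    with open('lines.jl', 'wb') as f:
        exporter = JsonLinesItemExporter(f, ensure_ascii=False)
        for item in items:
            exporter.export_item(item)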

III. Writing to a MongoDB database (see the pipelines chapter for the full version; a minimal sketch follows)
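
For reference, here is a minimal sketch using pymongo, where MONGO_URI and MONGO_DBNAME are assumed custom settings keys rather than anything Scrapy defines:

    import pymongo

    class MongoPipeline(object):
        def __init__(self, mongo_uri, mongo_db):
            self.mongo_uri = mongo_uri
            self.mongo_db = mongo_db

        @classmethod
        def from_crawler(cls, crawler):
            # MONGO_URI and MONGO_DBNAME are assumed settings keys; add them to settings.py yourself
            return cls(
                mongo_uri=crawler.settings.get('MONGO_URI', 'mongodb://localhost:27017'),
                mongo_db=crawler.settings.get('MONGO_DBNAME', 'py_test'),
            )

        def open_spider(self, spider):
            self.client = pymongo.MongoClient(self.mongo_uri)
            self.db = self.client[self.mongo_db]

        def process_item(self, item, spider):
            # one collection per spider, named after the spider
            self.db[spider.name].insert_one(dict(item))
            return item

        def close_spider(self, spider):
            self.client.close()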

IV. Writing to a MySQL database

  • 1. Create the table to match the item's fields (a sketch of a matching CREATE TABLE call appears after this list)
  • 2. Configure the MySQL connection settings in settings.py

    MYSQL_HOST = 'localhost'
    MYSQL_DBNAME = 'py_test'  # database name, change as needed
    MYSQL_USER = 'root'       # database user, change as needed
    MYSQL_PASSWD = 'root'     # database password, change as needed
    MYSQL_PORT = 3306         # database port, read when the pipeline builds the connection
    
  • 3. Write a pipeline class in pipelines.py

    import pymysql
    
    class XiciPipeline(object):
        def __init__(self, dbparams):
            self.connect = pymysql.connect(
                host=dbparams['host'],
                port=dbparams['port'],
                db=dbparams['db'],
                user=dbparams['user'],
                passwd=dbparams['passwd'],
                charset=dbparams['charset'],
                use_unicode=dbparams['use_unicode']
            )
            # create a cursor to execute SQL with
            self.cursor = self.connect.cursor()
    
        @classmethod
        def from_crawler(cls, crawler):
            # read the connection settings from settings.py
            dbparams = dict(
                host=crawler.settings.get('MYSQL_HOST'),
                db=crawler.settings.get('MYSQL_DBNAME'),
                user=crawler.settings.get('MYSQL_USER'),
                passwd=crawler.settings.get('MYSQL_PASSWD'),
                port=crawler.settings.get('MYSQL_PORT'),
                charset='utf8',  # set the charset, otherwise Chinese text may end up garbled
                use_unicode=False,
            )
            return cls(dbparams)
    
        def process_item(self, item, spider):
            if spider.name == 'xici':
                # SQL statement: insert one row
                sql = 'insert into xici(ip, port, speed, proxy_type, localhost) values (%s, %s, %s, %s, %s)'
                self.cursor.execute(sql, (item['ip'], item['port'], item['speed'], item['proxy_type'], item['localhost']))
                self.connect.commit()
            return item
    
        def close_spider(self, spider):
            self.connect.close()
    
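The INSERT statement above assumes the xici table from step 1 already exists. A one-off sketch that creates it with pymysql; the column types are assumptions derived from the item fields, so adjust them to your data:

    import pymysql

    create_sql = '''
    CREATE TABLE IF NOT EXISTS xici (
        id INT AUTO_INCREMENT PRIMARY KEY,
        ip VARCHAR(64),
        port VARCHAR(16),
        speed VARCHAR(32),
        proxy_type VARCHAR(32),
        localhost VARCHAR(128)
    ) DEFAULT CHARSET=utf8
    '''

    connect = pymysql.connect(host='localhost', port=3306, db='py_test',
                              user='root', passwd='root', charset='utf8')
    with connect.cursor() as cursor:
        cursor.execute(create_sql)  # DDL statements are auto-committed by MySQL
    connect.close()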
