1. csv文件写入
写入文件是一个非常简单的事情. 直接在pipeline中开启文件即可. 但这里要说明的是. 如果我们只在process_item中进行处理文件是不够优雅的. 总不能有一条数据就open一次吧
class CaipiaoFilePipeline:
    """Naive file pipeline: reopens the output file for every single item."""

    def process_item(self, item, spider):
        reds = "_".join(item["red_ball"])
        blues = "_".join(item["blue_ball"])
        record = f"{item['qihao']}, {reds}, {blues}\n"
        # One open() per item works, but it is wasteful on purpose here —
        # the tutorial improves on this below with open_spider/close_spider.
        with open("caipiao.txt", mode="a", encoding="utf-8") as sink:
            sink.write(record)
        return item
我们希望的是, 能不能打开一个文件, 然后就用这一个文件句柄来完成数据的保存. 答案是可以的. 我们可以在pipeline中创建两个方法, 一个是open_spider(), 另一个是close_spider(). 看名字也能明白其含义:
open_spider(), 在爬虫开始的时候执行一次
close_spider(), 在爬虫结束的时候执行一次
有了这俩货, 我们就可以很简单的去处理这个问题
class CaipiaoFilePipeline:
    """File pipeline that holds a single file handle for the whole crawl."""

    def open_spider(self, spider):
        # Opened once when the spider starts; reused by every process_item call.
        self.f = open("caipiao.txt", mode="a", encoding="utf-8")

    def close_spider(self, spider):
        # Release the handle once the spider has finished.
        if self.f:
            self.f.close()

    def process_item(self, item, spider):
        reds = "_".join(item["red_ball"])
        blues = "_".join(item["blue_ball"])
        self.f.write(f"{item['qihao']}, {reds}, {blues}\n")
        return item
在爬虫开始的时候打开一个文件, 在爬虫结束的时候关闭这个文件. 满分~
对了, 别忘了设置settings
# settings.py: register the file pipeline so Scrapy actually runs it.
ITEM_PIPELINES = {
    'caipiao.pipelines.CaipiaoFilePipeline': 300,
}
2. mysql数据库写入
有了上面的示例, 写入数据库其实也就很顺其自然了, 首先, 在open_spider中创建好数据库连接. 在close_spider中关闭连接. 在process_item中对数据进行保存工作.
先把mysql相关设置丢到settings里
# MySQL connection settings (settings.py), read by CaipiaoMySQLPipeline.
MYSQL_CONFIG = {
    "host": "localhost",
    "port": 3306,
    "user": "root",
    "password": "root",
    "database": "spider",
}
from caipiao.settings import MYSQL_CONFIG as mysql
import pymysql
class CaipiaoMySQLPipeline:
    """Persist lottery items into the MySQL table `caipiao`."""

    def open_spider(self, spider):
        # One connection for the whole crawl; credentials come from MYSQL_CONFIG.
        self.conn = pymysql.connect(
            host=mysql["host"],
            port=mysql["port"],
            user=mysql["user"],
            password=mysql["password"],
            database=mysql["database"],
        )

    def close_spider(self, spider):
        self.conn.close()

    def process_item(self, item, spider):
        # Insert one row per item; commit on success, roll back on any failure.
        sql = "insert into caipiao(qihao, red, blue) values(%s, %s, %s)"
        red = ",".join(item['red_ball'])
        blue = ",".join(item['blue_ball'])
        try:
            # BUG FIX: the cursor was never closed. pymysql cursors are
            # context managers, so `with` guarantees cleanup.
            with self.conn.cursor() as cursor:
                cursor.execute(sql, (item['qihao'], red, blue))
            self.conn.commit()
            spider.logger.info(f"保存数据{item}")
        except Exception:
            self.conn.rollback()
            # BUG FIX: logger.error(msg, e, msg2) treated `e` and the second
            # message as %-format args for a message with no placeholders, so
            # they were never rendered. Log one message plus the traceback.
            spider.logger.error(f"保存数据库失败! 数据是: {item}", exc_info=True)
        return item
别忘了把pipeline设置一下
# settings.py: register the MySQL pipeline.
ITEM_PIPELINES = {
    'caipiao.pipelines.CaipiaoMySQLPipeline': 301,
}
3. mongodb数据库写入
mongodb数据库写入和mysql写入如出一辙…不废话直接上代码吧
# MongoDB connection settings (settings.py), read by CaipiaoMongoDBPipeline.
# `has_user` toggles whether the pipeline authenticates.
MONGO_CONFIG = {
    "host": "localhost",
    "port": 27017,
    'has_user': True,
    'user': "python_admin",
    "password": "root",
    "db": "python"
}
from caipiao.settings import MONGO_CONFIG as mongo
import pymongo
class CaipiaoMongoDBPipeline:
    """Persist lottery items into the MongoDB collection `caipiao`."""

    def open_spider(self, spider):
        # BUG FIX: Database.authenticate() was deprecated in pymongo 3.x and
        # removed in 4.x; credentials are now passed to MongoClient directly.
        auth = {}
        if mongo['has_user']:
            auth = {"username": mongo['user'], "password": mongo['password']}
        client = pymongo.MongoClient(host=mongo['host'],
                                     port=mongo['port'],
                                     **auth)
        self.client = client
        self.collection = client[mongo['db']]['caipiao']

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # BUG FIX: Collection.insert() was removed in pymongo 4.x;
        # insert_one() is the supported single-document insert.
        self.collection.insert_one(
            {"qihao": item['qihao'], 'red': item["red_ball"], 'blue': item['blue_ball']}
        )
        return item
ITEM_PIPELINES = {
    # All three pipelines can coexist; each item passes through every one.
    'caipiao.pipelines.CaipiaoFilePipeline': 300,
    'caipiao.pipelines.CaipiaoMySQLPipeline': 301,
    'caipiao.pipelines.CaipiaoMongoDBPipeline': 302,
}