1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92
| from itemadapter import ItemAdapter from scrapy.exceptions import DropItem from pymongo import MongoClient from datetime import datetime import ithome2.items as items import ithome2.env as env
import json
class Ithome2Pipeline:
    """Filter out article items that have too few views."""

    def process_item(self, item, spider):
        """Return *item* unchanged, or drop it if it is a low-traffic article.

        Only items whose class is named ``IthomeArticleItem`` are inspected;
        every other item passes straight through.

        Raises:
            DropItem: the article's ``view_count`` is below 100.
        """
        # Anything that is not an article is none of this pipeline's business.
        if type(item).__name__ != 'IthomeArticleItem':
            return item

        too_few_views = item['view_count'] < 100
        if too_few_views:
            reason = f'[{item["title"]}] 瀏覽數小於 100'
            raise DropItem(reason)

        return item
class MongoPipeline:
    """Store scraped articles and replies in MongoDB and mirror them to JSON.

    Articles are written to the ``articles`` collection and to ``art.json``;
    replies are written to the ``response`` collection and to ``resp.json``.
    Each file holds a single JSON array with one record per line.
    """

    # Names of the MongoDB collections used for each item type.
    collection_article = 'articles'
    collection_response = 'response'

    # Timestamp layout used for datetimes in the JSON export files.
    _TIME_FORMAT = "%Y-%m-%d %H:%M:%S"

    def open_spider(self, spider):
        """Connect to MongoDB and start the two JSON export files.

        Credentials are read from ``env.MONGO_USER`` / ``env.MONGO_PASSWORD``.
        """
        dbname = 'ithome2'
        host = 'localhost'
        port = 27017

        # Pass the credentials as keyword arguments rather than formatting
        # them into a URI: characters such as '@', ':' or '/' in a password
        # would otherwise have to be percent-escaped to keep the URI valid.
        self.client = MongoClient(
            host=host,
            port=port,
            username=env.MONGO_USER,
            password=env.MONGO_PASSWORD,
        )
        self.db = self.client[dbname]

        # Export files that already contain at least one record; used to
        # decide whether a separating comma is needed before the next one.
        self._json_started = set()

        opened = []
        try:
            for path in ('art.json', 'resp.json'):
                handle = open(path, 'w', encoding='utf-8')
                opened.append(handle)
                handle.write('[\n')
        except OSError:
            # Do not leak what was already acquired if a file cannot be set up.
            for handle in opened:
                handle.close()
            self.client.close()
            raise
        self.file1, self.file2 = opened

    def close_spider(self, spider):
        """Close the MongoDB client and terminate/close both JSON files.

        Every resource is released even if closing an earlier one raises.
        """
        try:
            self.client.close()
        finally:
            try:
                self._finish_json(self.file1)
            finally:
                self._finish_json(self.file2)

    def process_item(self, item, spider):
        """Persist *item* according to its exact type and return it.

        Items that are neither ``IthomeArticleItem`` nor ``IthomeReplyItem``
        are returned untouched.
        """
        if type(item) is items.IthomeArticleItem:
            self._save_article(item)
        elif type(item) is items.IthomeReplyItem:
            self._save_reply(item)
        return item

    def _save_article(self, item):
        """Insert or update an article (matched by URL) and export it."""
        collection = self.db[self.collection_article]
        existing = collection.find_one({'url': item['url']})
        item['update_time'] = datetime.now()

        if existing is None:
            inserted = collection.insert_one(dict(item))
            item['_id'] = str(inserted.inserted_id)
        else:
            collection.update_one(
                {'_id': existing['_id']},
                {'$set': dict(item)},
            )
            item['_id'] = str(existing['_id'])

        record = dict(item)
        record['update_time'] = record['update_time'].strftime(self._TIME_FORMAT)
        self._write_json_record(self.file1, record)

    def _save_reply(self, item):
        """Export a reply and upsert it into MongoDB keyed by its ``_id``."""
        # Work on a copy so the item handed to later pipelines keeps its
        # '_id' regardless of whether the document already existed.
        fields = dict(item)
        reply_id = fields.pop('_id')

        record = dict(fields)
        record['publish_time'] = record['publish_time'].strftime(self._TIME_FORMAT)
        record['article_id'] = str(record['article_id'])
        self._write_json_record(self.file2, record)

        # A single upsert inserts the reply when it is new and refreshes its
        # fields otherwise, with no separate lookup round-trip.
        self.db[self.collection_response].update_one(
            {'_id': reply_id},
            {'$set': fields},
            upsert=True,
        )

    def _write_json_record(self, handle, values):
        """Append *values* as one element of the JSON array in *handle*."""
        # The comma goes *before* every record except the first, so the array
        # never ends with a trailing comma (which would be invalid JSON).
        if handle in self._json_started:
            handle.write(',\n')
        else:
            self._json_started.add(handle)
        handle.write(json.dumps(values, ensure_ascii=False))

    @staticmethod
    def _finish_json(handle):
        """Write the closing bracket of the JSON array and close *handle*."""
        try:
            handle.write('\n]')
        finally:
            handle.close()
|