# (removed extraction artifact: stray line-number residue)
import os
import urllib
import urllib.request

import scrapy
from scrapy.loader import ItemLoader
from scrapy_splash import SplashRequest

import ppt.items as items
class BeautySpider(scrapy.Spider):
    """Scrape image posts from the PTT Beauty board.

    Pages are rendered through Splash (``endpoint='execute'``) because the
    board sits behind an over-18 confirmation button that must be clicked
    with JavaScript enabled.  The first Lua script clicks through the gate
    and returns the session cookies; the second script replays those
    cookies on every subsequent post page.
    """

    # File-extension suffixes recognised in image URLs.
    JPG = '.jpg'
    PNG = '.png'
    # Local folder images are saved into.
    IMAGE_FOLDER = 'images'
    # Stop after this many images have been downloaded.
    IMAGE_MAX = 5

    name = 'beauty'
    allowed_domains = ['www.ptt.cc']
    URL_ENTRY = 'https://www.ptt.cc/bbs/Beauty/index.html'
    # Running count of downloaded images (shared across responses).
    index = 1

    # First visit: abort CSS requests, disable images, click the
    # over-18 button, then hand back cookies + rendered HTML.
    script_1st = '''
    function main(splash, args)
        splash:on_request(function(request)
            if request.url:find('css') then
                request.abort()
            end
        end)
        splash.images_enabled = false
        -- need run js for click --
        -- splash.js_enabled = false --
        assert(splash:go(args.url))
        assert(splash:wait(0.5))
        local element = splash:select('.over18-button-container > button')
        element:mouse_click()
        assert(splash:wait(1))
        return {
            cookies = splash:get_cookies(),
            html = splash:html(),
        }
    end
    '''

    # Follow-up visits: replay the stored cookies so the over-18 gate
    # is skipped, otherwise identical to script_1st minus the click.
    script_2nd = '''
    function main(splash, args)
        splash:init_cookies(splash.args.cookies)
        splash:on_request(function(request)
            if request.url:find('css') then
                request.abort()
            end
        end)
        splash.images_enabled = false
        -- need run js for click --
        -- splash.js_enabled = false --
        assert(splash:go(args.url))
        assert(splash:wait(0.5))
        assert(splash:wait(1))
        return {
            cookies = splash:get_cookies(),
            html = splash:html(),
        }
    end
    '''

    def start_requests(self):
        """Entry point: render the board index, clicking the over-18 gate."""
        yield SplashRequest(
            url=self.URL_ENTRY,
            callback=self.parse,
            endpoint='execute',
            args={'lua_source': self.script_1st},
        )

    def parse(self, response):
        """Parse the board index and request each non-announcement post.

        Stores the Splash session cookies on ``self`` so follow-up
        requests can replay them via ``script_2nd``.
        """
        self.cookies = response.data['cookies']
        for post in response.xpath("//div[@class='r-ent']"):
            beauty_item = items.PptBeautyItem()
            beauty_item['title'] = post.xpath(".//div[@class='title']/a/text()").get()
            beauty_item['url'] = post.xpath(".//div[@class='title']/a/@href").get()
            beauty_item['push_count'] = post.xpath(".//div[@class='nrec']/span/text()").get()
            beauty_item['author'] = post.xpath(".//div[@class='author']/text()").get()
            # Deleted posts have no title link; also skip announcements ('公告').
            if not beauty_item['title'] or '公告' in beauty_item['title']:
                continue
            yield SplashRequest(
                url=response.urljoin(beauty_item['url']),
                callback=self.post_parse,
                endpoint='execute',
                args={'lua_source': self.script_2nd},
                cookies=self.cookies,
            )

    def post_parse(self, response):
        """Parse a single post page: download its images and yield items.

        Stops contributing once ``self.index`` reaches ``IMAGE_MAX``.
        """
        self.cookies = response.data['cookies']
        if self.index >= self.IMAGE_MAX:
            return
        # Second article-meta-value span is the post title.
        title = response.xpath(
            "(//div[@class='article-metaline']//span[@class='article-meta-value'])[2]/text()"
        ).get()
        rich_index = 1  # per-post counter used in the saved file name
        for rich in response.xpath("//div[@class='richcontent']"):
            image_url = rich.xpath(".//img/@src").get()
            if image_url is None:
                # BUG FIX: a richcontent block without an <img> yielded None,
                # which crashed the `in` membership checks below.
                continue
            loader = ItemLoader(item=items.PptPostItem())
            loader.add_value('image_urls', [image_url])
            loader.add_value('index', self.index)
            if self.PNG in image_url:
                file_name = f"{title}{rich_index}{self.PNG}"
            elif self.JPG in image_url:
                file_name = f"{title}{rich_index}{self.JPG}"
            else:
                # Unknown extension: keep the original 'None' marker + .jpg.
                file_name = f"{title}{rich_index}None{self.JPG}"
            rich_index += 1
            self.image_download(image_url, file_name, self.IMAGE_FOLDER)
            self.index += 1
            yield loader.load_item()
            if self.index > self.IMAGE_MAX:
                break

    def image_download(self, url, name, folder):
        """Download ``url`` into ``folder/name``, creating the folder if needed.

        NOTE(review): blocking download inside a Scrapy callback — consider
        Scrapy's ImagesPipeline instead; kept as-is to preserve behavior.
        """
        directory = os.path.abspath(folder)
        # BUG FIX: urlretrieve raised FileNotFoundError when the target
        # folder did not exist yet.
        os.makedirs(directory, exist_ok=True)
        work_path = os.path.join(directory, name)
        urllib.request.urlretrieve(url, work_path)