Scrapyつかってスライドをごそっととってくる
SlideShareをスクレイピングするやつ〜
今日はもう眠いのでスパイダーのソースだけ
scrapyでやってます.
# -*- coding: utf-8 -*-
import io
import os
import time
import urllib
from datetime import datetime as dt

import requests
import scrapy
from PIL import Image
from pytz import timezone
from reportlab.pdfgen import canvas
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_splash import SplashRequest

from slide_scrapy.items import SlideScrapyItem


class SlideshareSpider(CrawlSpider):
    """Crawl SlideShare search results and save each deck as a PDF.

    Flow: start page -> search URL (``parse``) -> first result page
    (``after_search``) -> each slideshow page (``content_search``), where
    every slide image is downloaded and appended as one PDF page.

    Pages are rendered through Splash because the result/slide markup is
    built by JavaScript.
    """

    name = 'slideshare'
    allowed_domains = ['www.slideshare.net']
    start_urls = ['http://www.slideshare.net/']
    custom_settings = {
        "DOWNLOAD_DELAY": 1,
        # Column order of the exported CSV feed.
        "FEED_EXPORT_FIELDS": ["title", "author", "page", "path", "url"],
    }

    # Timestamps (JST) used when naming the CSV export.
    now = dt.now(timezone('Asia/Tokyo'))
    date = now.strftime('%Y-%m-%d')
    jst_time = now.strftime('%Y-%m-%dT%H-%M-%S')

    # rules = (
    #     Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    # )

    def parse(self, response):
        """Build the search URL for a fixed query and render it via Splash."""
        search_url = urllib.parse.urljoin(response.url, "search/slideshow")
        query = urllib.parse.urlencode({
            "searchfrom": "header",
            "q": "iot python",
            "lang": "ja",
        })
        url = search_url + "?" + query
        self.logger.info("search url: %s", url)  # was print(); use the spider logger
        yield SplashRequest(url, method="get", callback=self.after_search,
                            args={"wait": 2, "timeout": 90}, dont_filter=True)

    def after_search(self, response):
        """Follow every slideshow link on the first result page only."""
        contents = response.xpath(
            "//div[@class='thumbnail-content ']/a/@href").extract()
        for content in contents:
            url = urllib.parse.urljoin(response.url, content)
            yield SplashRequest(url, method="get",
                                callback=self.content_search,
                                args={"wait": 2, "timeout": 90},
                                dont_filter=True)

    def content_search(self, response):
        """Download every slide image of one deck and write them into a PDF.

        Yields one SlideScrapyItem describing the deck (title, author,
        page count, local PDF path, source URL).
        """
        total = response.xpath(
            "//span[@id='total-slides']/text()").extract_first()
        title = response.xpath(
            "//span[@class='j-title-breadcrumb']/text()").extract_first()
        author = response.xpath(
            "//span[@itemprop='name']/text()").extract_first()
        # Guard against layout changes: skip the page instead of crashing
        # the whole crawl with AttributeError/TypeError on None.
        if total is None or title is None or author is None:
            self.logger.warning("unexpected page layout, skipping: %s",
                                response.url)
            return
        max_page = int(total)
        title = title.strip()
        author = author.strip()
        images_url = response.xpath(
            "//img[@class='slide_image']/@data-full").extract()

        pdf_name = title + ".pdf"
        output_dir = "適当なディレクトリ"  # TODO: set a real output directory
        out_path = os.path.join(output_dir, pdf_name)

        item = SlideScrapyItem()
        item["title"] = title
        item["author"] = author
        item["page"] = max_page
        item["path"] = out_path
        item["url"] = response.url

        pdf_cnvs = canvas.Canvas(out_path)
        tmp_png = ".temp.png"
        try:
            for idx, img_url in enumerate(images_url):
                self.logger.info("title: {0} {1}/{2}".format(
                    title, idx + 1, max_page))
                image = Image.open(io.BytesIO(requests.get(img_url).content))
                # Round-trip through a temp file on purpose: drawing the
                # in-memory image straight into the canvas distorted the
                # red channel (per the author's note), so re-open from disk.
                image.save(tmp_png)
                image = Image.open(tmp_png)
                # Page slightly larger than the image so it fits with a margin.
                pdf_cnvs.setPageSize((image.size[0] + 10, image.size[1] + 10))
                pdf_cnvs.drawInlineImage(image, 0, 0)
                pdf_cnvs.showPage()
                time.sleep(1)  # be polite to the image CDN
        finally:
            # Remove once at the end (and on error) instead of every
            # iteration; previously a mid-loop exception leaked the file.
            if os.path.exists(tmp_png):
                os.remove(tmp_png)
        pdf_cnvs.save()
        yield item
urlから画像をPILのImageオブジェクトのままpdfにしたかったけどなんか赤色が加算?されちゃうので一回隠しファイルでファイルを生成してます〜
超カッコ悪くて草