
Grabbing slides in bulk with Scrapy

A thing that scrapes SlideShare~

I'm already sleepy today, so for now here's just the spider source.
It's built with Scrapy.
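It goes through scrapy-splash, so this assumes a Splash instance is running and that settings.py has the usual wiring from the scrapy-splash README (a sketch; adjust SPLASH_URL to your environment):

SPLASH_URL = 'http://localhost:8050'

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'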

# -*- coding: utf-8 -*-
import io
import os
import time
import urllib.parse
from datetime import datetime as dt

import requests
import scrapy
from PIL import Image
from pytz import timezone
from reportlab.pdfgen import canvas
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_splash import SplashRequest

from slide_scrapy.items import SlideScrapyItem


class SlideshareSpider(CrawlSpider):
    name = 'slideshare'
    allowed_domains = ['www.slideshare.net']
    start_urls = ['http://www.slideshare.net/']

    custom_settings = {
        "DOWNLOAD_DELAY": 1,
        "FEED_EXPORT_FIELDS": [
            "title",
            "author",
            "page",
            "path",
            "url"
        ]
    }

    # timestamps (JST), presumably for naming the csv export
    now = dt.now(timezone('Asia/Tokyo'))
    date = now.strftime('%Y-%m-%d')
    jst_time = now.strftime('%Y-%m-%dT%H-%M-%S')

    # rules = (
    #     Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    # )

    def parse(self, response):
        search_url = urllib.parse.urljoin(response.url, "search/slideshow")
        query = {
            "searchfrom": "header",
            "q": "iot python",
            "lang": "ja"
        }
        query = urllib.parse.urlencode(query)
        url = search_url + "?" + query
        self.logger.info(url)

        yield SplashRequest(url,
                            method="get",
                            callback=self.after_search,
                            args={"wait": 2,
                                  "timeout": 90},
                            dont_filter=True)

    def after_search(self, response):
        # just the first page for now
        # note: the trailing space in the class attribute is intentional
        contents = response.xpath(
            "//div[@class='thumbnail-content ']/a/@href").extract()
        for content in contents:
            url = urllib.parse.urljoin(response.url, content)
            yield SplashRequest(url,
                                method="get",
                                callback=self.content_search,
                                args={"wait": 2,
                                      "timeout": 90},
                                dont_filter=True)

    def content_search(self, response):
        item = SlideScrapyItem()

        max_page = int(response.xpath(
            "//span[@id='total-slides']/text()").extract_first())
        images_url = response.xpath(
            "//img[@class='slide_image']/@data-full").extract()

        title = response.xpath(
            "//span[@class='j-title-breadcrumb']/text()").extract_first().strip()
        author = response.xpath(
            "//span[@itemprop='name']/text()").extract_first().strip()

        pdf_name = title + ".pdf"  # note: a title containing '/' etc. would break the path
        output_dir = "path/to/output/dir/"  # set to wherever you want the PDFs
        pdf_path = os.path.join(output_dir, pdf_name)

        item["title"] = title
        item["author"] = author
        item["page"] = max_page
        item["path"] = pdf_path
        item["url"] = response.url

        pdf_cnvs = canvas.Canvas(pdf_path)
        for idx, img_url in enumerate(images_url):
            self.logger.info("title: {0} {1}/{2}".format(title, idx + 1, max_page))
            image = Image.open(io.BytesIO(requests.get(img_url).content))
            # Drawing the in-memory Image straight into the PDF tinted it red,
            # so write it out to a temp PNG and read it back as a workaround.
            image.save(".temp.png")
            image = Image.open(".temp.png")
            pdf_cnvs.setPageSize((image.size[0] + 10, image.size[1] + 10))
            pdf_cnvs.drawInlineImage(image, 0, 0)
            pdf_cnvs.showPage()
            time.sleep(1)
        if os.path.exists(".temp.png"):
            os.remove(".temp.png")
        pdf_cnvs.save()
        yield item
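The SlideScrapyItem it imports isn't shown above; a minimal sketch of slide_scrapy/items.py, assuming nothing beyond the five fields the spider fills in, would be:

# -*- coding: utf-8 -*-
import scrapy


class SlideScrapyItem(scrapy.Item):
    # the five fields listed in FEED_EXPORT_FIELDS
    title = scrapy.Field()
    author = scrapy.Field()
    page = scrapy.Field()
    path = scrapy.Field()
    url = scrapy.Field()

With that in place, running something like scrapy crawl slideshare -o slides.csv should dump a CSV with those columns.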


I wanted to take the images from the URLs and put them into the PDF as PIL Image objects as-is, but the red channel somehow gets added(?) on top, so I write them out once to a hidden temp file instead~

Super ugly, lol.
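If I had to guess, the tint comes from the image mode (RGBA, palette, ...) confusing drawInlineImage, so converting to RGB in memory might make the temp file unnecessary. Untested sketch with a stand-in image:

from PIL import Image
from reportlab.pdfgen import canvas

# stand-in for a slide image downloaded with requests
image = Image.new("RGBA", (800, 600), (200, 80, 80, 128))

pdf_cnvs = canvas.Canvas("out.pdf")
pdf_cnvs.setPageSize((image.size[0] + 10, image.size[1] + 10))
# normalize the mode in memory instead of round-tripping through .temp.png
pdf_cnvs.drawInlineImage(image.convert("RGB"), 0, 0)
pdf_cnvs.showPage()
pdf_cnvs.save()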