Pythonクローリング＆スクレイピング［増補改訂版］ ―データ収集・解析のための実践開発ガイド― その18

第6章フレームワーク Scrapy

ページを遷移しながらクローリングするためのライブラリについて学びます。

CrawlSpider

CrawlSpider を継承して、リンクを辿るときのルールを指定してクローリングを行います。

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from myproject.items import Headline


class NewCrawlSpider(CrawlSpider):
    """Crawl news.yahoo.co.jp and scrape each linked topic page.

    Starting from the top page, links whose URL ends in ``/pickup/<digits>``
    are followed and their responses are handed to ``parse_topics``.
    """

    name = 'news_crawl'
    allowed_domains = ['news.yahoo.co.jp']
    start_urls = ['https://news.yahoo.co.jp/']

    # Link-following rules: links matching the pattern are requested and the
    # response is passed to the named callback.
    rules = (
        Rule(
            LinkExtractor(allow=r'/pickup/\d+$'),
            callback='parse_topics',
        ),
    )

    def parse_topics(self, response):
        """Yield a ``Headline`` item with the topic's title and body text."""
        title = response.css('.tpcNews_title::text').get()
        # XPath string() collapses the summary element and all of its
        # descendants into a single text value.
        body = response.css('.tpcNews_summary').xpath('string()').get()

        headline = Headline()
        headline['title'] = title
        headline['body'] = body

        # Echo the scraped title to stdout as a visual progress marker.
        print('-------------------')
        print(headline['title'])
        yield headline

IKEA のスクレイピング

価格部分の抜き出し方がよくわかりませんでした。

from scrapy.spiders import SitemapSpider


class IkeaSpider(SitemapSpider):
    """Scrape IKEA product pages discovered through the site's sitemaps."""

    name = 'ikea'
    allowed_domains = ['www.ikea.com']
    custom_settings = {
        'USER_AGENT': 'ikeabot',
    }
    # Entry point for sitemap discovery. SitemapSpider accepts a robots.txt
    # URL here and reads the sitemap locations declared in it.
    sitemap_urls = ['https://www.ikea.com/robots.txt']
    # Of the sitemaps discovered, follow only those whose URL matches one of
    # these patterns (presumably the Japanese product sitemap -- confirm
    # against the live robots.txt).
    sitemap_follow = [
        r'prod-ja_JP',
    ]
    # Sitemap entries whose URL matches /products/ are sent to parse_product.
    sitemap_rules = [
        (r'/products/', 'parse_product'),
    ]

    def parse_product(self, response):
        """Yield a dict describing the product shown on ``response``.

        The dict has the keys ``url``, ``name``, ``type`` and ``price``.
        When the element for a field is not present on the page, that field
        is emitted as an empty string instead of raising ``AttributeError``.
        """
        name = response.css('#name::text').get(default='').strip()
        product_type = response.css('#type::text').get(default='').strip()
        # Take the first run of non-whitespace characters from the price
        # text. \S does not match a non-breaking space (U+00A0, &nbsp;), so
        # it is listed explicitly to keep the run intact, and is then
        # replaced with a regular space. The pattern is a raw string so that
        # \S is not treated as an (invalid) string escape.
        price = (
            response.css('#price1::text')
            .re_first(r'[\S\xa0]+', default='')
            .replace('\xa0', ' ')
        )

        yield {
            'url': response.url,
            'name': name,
            'type': product_type,
            'price': price,
        }