Scrapy: Scraping Web Sites with Python – Tutorial

Scrapy is a very powerful framework that enables you to crawl web pages and extract the information you need.

Installation

pip install scrapy

Start the project

To start a project, go to the folder where you would like the project to reside and type:

scrapy startproject tutorial

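This will generate the initial files:

tutorial/
    scrapy.cfg            # deploy configuration file
    tutorial/             # project's Python module
        __init__.py
        items.py          # project items definition file
        middlewares.py    # project middlewares file
        pipelines.py      # project pipelines file
        settings.py       # project settings file
        spiders/          # directory where you put your spiders
            __init__.py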

Create crawler

Here is a sample crawler that extracts links:

import scrapy

class Item(scrapy.Item):
    """Scraped record that holds a single page URL."""

    # URL of a page found by the spider.
    link = scrapy.Field()

class BlogSpider(scrapy.Spider):
    """Collect the URLs of the a-z pages and of their pagination subpages.

    ``parse`` follows every link in the header navigation of the start page;
    ``parse_subpage`` then yields one ``Item`` for the visited page and one
    for each pagination link found on it.
    """

    name = 'getabc123'
    # Entries must be bare domain names: a value that includes a URL path
    # (e.g. 'example.org/section') never matches the host of a request.
    allowed_domains = ['kulturnicenterq.org']
    start_urls = ['https://www.kulturnicenterq.org/lgbtqslovar/']

    def parse(self, response):
        """Request every page linked from the header navigation.

        Args:
            response: Response for one of ``start_urls``.

        Yields:
            scrapy.Request: One request per navigation entry that has a link,
            handled by ``parse_subpage``.
        """
        for li in response.xpath('//*[@id="header"]/nav/ul/li'):
            href = li.css("li a::attr(href)").extract_first()
            if not href:
                # Navigation entry without an anchor/href: nothing to follow.
                continue

            yield scrapy.Request(
                # urljoin turns relative hrefs into absolute URLs; Request
                # rejects URLs without a scheme.
                url=response.urljoin(href),
                callback=self.parse_subpage,
                dont_filter=True,
            )

    def parse_subpage(self, response):
        """Yield the URL of this page and of each pagination link on it.

        Args:
            response: Response for one page reached from the navigation.

        Yields:
            Item: First the page itself, then one item per pagination anchor
            that carries an href.
        """
        # Add the first page to the list
        yield Item(link=response.url)

        for nav in response.xpath('//a[contains(@class, "page")]'):
            href = nav.css("a::attr(href)").extract_first()
            if not href:
                continue
            # Add subpages to the list (as absolute URLs)
            yield Item(link=response.urljoin(href))

And here is a more complex one that crawls through several levels of pages before it gets to the data we need:


import re
import scrapy


class Item(scrapy.Item):
    """One meaning of a dictionary term, as scraped from a term page."""

    # Term title with surrounding whitespace stripped.
    term = scrapy.Field()
    # Origin marker taken from the content paragraph (e.g. ang., gr., kratica).
    origin = scrapy.Field()
    # Link texts from the related-posts list of the term page.
    synonyms = scrapy.Field()
    # Text of one entry of the ordered list of descriptions.
    meaning = scrapy.Field()
    # Example texts taken from the list entry at the same position as the meaning.
    examples = scrapy.Field()
    # 1-based position of the meaning within the list of descriptions.
    importance = scrapy.Field()
    # True when the page has an <h1> whose class contains "novo" (new word).
    new = scrapy.Field()


class Pages(scrapy.Spider):
    """Crawl the dictionary and emit one ``Item`` per meaning of every term.

    The crawl walks four levels: the a-z navigation in the header
    (``parse``), the pagination of each a-z page (``parse_a_z``), the term
    links on each listing page (``parse_subpages_for_terms``) and finally
    the term page itself (``parse_term``).
    """

    name = 'allinone'
    # Entries must be bare domain names: a value that includes a URL path
    # (e.g. 'example.org/section') never matches the host of a request.
    allowed_domains = ['kulturnicenterq.org']
    start_urls = ['https://www.kulturnicenterq.org/lgbtqslovar/']

    def parse(self, response):
        """Get links from the a-z navigation in the header.

        Yields:
            scrapy.Request: One request per navigation entry that has a link,
            handled by ``parse_a_z``.
        """
        for li in response.xpath('//*[@id="header"]/nav/ul/li'):
            href = li.css("li a::attr(href)").extract_first()
            if not href:
                # Navigation entry without an anchor/href: nothing to follow.
                continue

            # Check a-z pages for any subpages
            yield scrapy.Request(
                # urljoin turns relative hrefs into absolute URLs; Request
                # rejects URLs without a scheme.
                url=response.urljoin(href),
                callback=self.parse_a_z,
                dont_filter=True,
            )

    def parse_a_z(self, response):
        """Parse an a-z page and request it plus its subpages (/page/2/ ...).

        Yields:
            scrapy.Request: One request per distinct listing page, handled by
            ``parse_subpages_for_terms``.
        """
        # Page 1 is the page we are already on.
        links = [response.url]
        seen = {response.url}

        # Check pagination if there are any additional pages 2, 3, 4 ...
        for pagination in response.xpath('//a[contains(@class, "page")]'):
            href = pagination.css("a::attr(href)").extract_first()
            if not href:
                continue
            link = response.urljoin(href)
            # The same page can be linked more than once in the pagination;
            # requesting it twice would scrape its terms twice.
            if link not in seen:
                seen.add(link)
                links.append(link)

        # Go through all subpages links to get links of terms
        for link in links:
            yield scrapy.Request(
                url=link,
                callback=self.parse_subpages_for_terms,
                # Required: page 1 has already been downloaded once, so the
                # duplicate filter would otherwise drop this request.
                dont_filter=True,
            )

    def parse_subpages_for_terms(self, response):
        """Get the term page links from a listing page.

        Yields:
            scrapy.Request: One request per term link, handled by
            ``parse_term``.
        """
        for item in response.xpath('//*[@id="middle"]/div/div[1]/h1/a'):
            href = item.css("a::attr(href)").extract_first()
            if not href:
                continue

            # Go to all terms pages to get data
            yield scrapy.Request(
                url=response.urljoin(href),
                callback=self.parse_term,
                dont_filter=True,
            )

    def parse_term(self, response):
        """Parse a term page and yield one item per description.

        Items are yielded from the last description to the first;
        ``importance`` carries the 1-based position of the description.
        ``origin`` and ``meaning`` are always strings (empty when the page
        has no matching text).

        Yields:
            Item: term, new, origin, synonyms, meaning, examples, importance.
        """
        TITLE_XPATH = '//*[@id="middle"]/div/div[1]/h1/a'
        NEW_XPATH = '//h1[contains(@class, "novo")]'
        ORIGIN_XPATH = '//*[@id="content"]/p/text()'
        SYNONYMS_XPATH = '//*[@id="related-posts-MRP_all"]/ul/li/a/text()'
        DESCRIPTIONS_XPATH = '//*[@id="content"]/ol/li/p/text()'
        DESCRIPTION_XPATH = '//*[@id="content"]/ol/li[{}]/p/text()'
        EXAMPLE_XPATH = '//*[@id="content"]/ul/li[{}]/p/text()'

        for item in response.xpath('//*[@id="middle"]/div/div[1]'):
            # Get term
            title = item.xpath(TITLE_XPATH).css("a::text").extract_first()
            if title is None:
                # Without a title there is no term to report; skip instead
                # of failing on None.strip().
                self.logger.warning('No term title found on %s', response.url)
                continue
            term = title.strip()

            # Get the new status - new word
            new = bool(item.xpath(NEW_XPATH).extract())

            # Get origin - ang., gr., kratica
            origin = item.xpath(ORIGIN_XPATH).extract_first(default='')

            # Get synonyms
            synonyms = item.xpath(SYNONYMS_XPATH).extract()

            # Get descriptions
            descriptions_count = len(item.xpath(DESCRIPTIONS_XPATH).extract())

            # Parse descriptions and examples, last position first
            for position in range(descriptions_count, 0, -1):
                # Get description
                meaning = item.xpath(
                    DESCRIPTION_XPATH.format(position)).extract_first(default='')

                # Get example
                examples = item.xpath(
                    EXAMPLE_XPATH.format(position)).extract()

                if examples:
                    # Remove default copy if there are no examples
                    if examples[0] == 'Primerov \u0161e ni.':
                        examples[0] = ''

                    # Remove: 1| 2| 3| at the beginning of examples
                    examples[0] = re.sub(r'^\d+\|\s', '', examples[0])

                # Save the data
                yield Item(term=term, new=new, origin=origin,
                           synonyms=synonyms, meaning=meaning,
                           examples=examples, importance=position)

List all available crawlers

scrapy list

Run crawler

# run a spider by the `name` defined in its class
scrapy crawl getabc123
# output csv
scrapy crawl allinone -o dictionary.csv
# output json
scrapy crawl allinone -o dictionary.json

Scrapy shell

The Scrapy shell is a great tool for exploration. You can try out selectors interactively and see the data they return before you use them in a spider.

scrapy shell "https://www.cnn.com"

response.css('title')
response.css('title').extract()
response.css('title::text').extract()  # list
response.css('title::text')[0].extract()  # string
response.css('title::text').extract_first() # string

# CSS
response.css('title::text').re(r'Names.*')
['Names to Remember']
response.css('title::text').re(r'N\w+')
['Names']
response.css('title::text').re(r'(\w+) to (\w+)')
['Names', 'Remember']
quote.css("span.text::text").extract_first()
quote.css("div.tags a.tag::text").extract()

# XPATH
response.xpath('//title')
[<Selector xpath='//title' data='<title>Names to Remember</title>'>]
response.xpath('//title/text()').extract_first()
'Names to Remember'

Official tutorial

22nds

Livecoding and building feminist internet.