Menu

Scrapy: Scraping Web Sites with Python – Tutorial

July 28, 2017 - HTML, http, python

Scrapy is a very powerful framework that enables you to crawl web pages and extract information you need.

Installation

pip install scrapy

Start the project

To start a project go into the folder you would like the project to reside in and type:

scrapy startproject tutorial

This will generate the initial files:
Initial files

Create crawler

Here is a sample crawler that extracts links:

import scrapy

class Item(scrapy.Item):
    """Scraped item holding a single page link."""
    # Link of a page found by the spider (a response URL or an href value).
    link = scrapy.Field()

class BlogSpider(scrapy.Spider):
    """Collect page links reachable from the site's header navigation.

    ``parse`` follows every link in the header navigation list, and
    ``parse_subpage`` emits one ``Item`` for the page itself plus one for
    each pagination link found on it.
    """

    name = 'getabc123'
    # allowed_domains takes bare domain names, not URLs or paths: an entry
    # containing a path can never match a request's host.
    allowed_domains = ['kulturnicenterq.org']
    start_urls = ['https://www.kulturnicenterq.org/lgbtqslovar/']

    def parse(self, response):
        """Follow each link in the header navigation.

        Yields:
            scrapy.Request: one request per navigation link, handled by
            ``parse_subpage``.
        """
        for li in response.xpath('//*[@id="header"]/nav/ul/li'):
            link = li.css("li a::attr(href)").extract_first()
            if not link:
                # extract_first() returns None when the <li> has no link;
                # Request() would raise on a missing URL.
                continue

            yield scrapy.Request(
                # Resolve relative hrefs against the current page URL.
                url=response.urljoin(link),
                callback=self.parse_subpage,
                # NOTE(review): dont_filter=True skips both the duplicate
                # filter and the offsite (allowed_domains) check. It was
                # presumably a workaround for the path that used to be in
                # allowed_domains -- confirm whether it is still wanted.
                dont_filter=True,
            )

    def parse_subpage(self, response):
        """Emit the current page and every pagination link found on it.

        Yields:
            Item: ``link`` is the page URL first, then one item per
            pagination link (resolved to an absolute URL).
        """
        # Add the first page to the list
        yield Item(link=response.url)

        for nav in response.xpath('//a[contains(@class, "page")]'):
            link = nav.css("a::attr(href)").extract_first()
            if not link:
                # Anchor without an href: nothing to record.
                continue
            # Add subpages to the list
            yield Item(link=response.urljoin(link))

And a more COMPLEX one that parses a couple of pages before it gets to the data we need:


import re

import scrapy

# Matches the "1| ", "2| ", ... counter at the beginning of an example.
_EXAMPLE_PREFIX_RE = re.compile(r'^\d+\|\s')
# Default copy shown on a term page when there are no examples.
_NO_EXAMPLES_TEXT = 'Primerov \u0161e ni.'


class Item(scrapy.Item):
    """One meaning of a dictionary term, as scraped from its term page."""

    term = scrapy.Field()        # term title, stripped of whitespace
    origin = scrapy.Field()      # origin note (ang., gr., kratica); [] if absent
    synonyms = scrapy.Field()    # list of related-term titles
    meaning = scrapy.Field()     # text of this meaning; [] if none was found
    examples = scrapy.Field()    # list of example texts for this meaning
    importance = scrapy.Field()  # 1-based position of the meaning in its list
    new = scrapy.Field()         # True when the term is marked as a new word


class Pages(scrapy.Spider):
    """Crawl from the a-z navigation down to every term page.

    Flow: ``parse`` (a-z navigation) -> ``parse_a_z`` (pagination) ->
    ``parse_subpages_for_terms`` (term links) -> ``parse_term`` (data).
    """

    name = 'allinone'
    # allowed_domains takes bare domain names, not URLs or paths: an entry
    # containing a path can never match a request's host.
    allowed_domains = ['kulturnicenterq.org']
    start_urls = ['https://www.kulturnicenterq.org/lgbtqslovar/']

    def parse(self, response):
        """Get links from a-z navigation in the header.

        Yields:
            scrapy.Request: one request per navigation link, handled by
            ``parse_a_z``.
        """
        for li in response.xpath('//*[@id="header"]/nav/ul/li'):
            link = li.css("li a::attr(href)").extract_first()
            if not link:
                # extract_first() returns None when the <li> has no link;
                # Request() would raise on a missing URL.
                continue

            # Check a-z pages for any subpages
            yield scrapy.Request(
                url=response.urljoin(link),
                callback=self.parse_a_z,
                # NOTE(review): dont_filter=True skips both the duplicate
                # filter and the offsite (allowed_domains) check -- confirm
                # whether it is still wanted now that allowed_domains is a
                # plain domain.
                dont_filter=True,
            )

    def parse_a_z(self, response):
        """Parse a-z pages and get subpages eg. (/page/2/...).

        Yields:
            scrapy.Request: the term-page requests found on page 1, then one
            request per pagination link, handled by
            ``parse_subpages_for_terms``.
        """
        # Page 1 is the response already in hand: parse it directly instead
        # of downloading the same URL a second time.
        for request in self.parse_subpages_for_terms(response):
            yield request

        # Check pagination if there are any additional pages 2, 3, 4 ...
        for pagination in response.xpath('//a[contains(@class, "page")]'):
            link = pagination.css("a::attr(href)").extract_first()
            if not link:
                continue

            # Go through all subpages links to get links of terms
            yield scrapy.Request(
                url=response.urljoin(link),
                callback=self.parse_subpages_for_terms,
                dont_filter=True,
            )

    def parse_subpages_for_terms(self, response):
        """Get term page links from a listing page.

        Yields:
            scrapy.Request: one request per term link, handled by
            ``parse_term``.
        """
        for item in response.xpath('//*[@id="middle"]/div/div[1]/h1/a'):
            link = item.css("a::attr(href)").extract_first()
            if not link:
                continue

            # Go to all terms pages to get data
            yield scrapy.Request(
                url=response.urljoin(link),
                callback=self.parse_term,
                dont_filter=True,
            )

    def parse_term(self, response):
        """Parse term page to get all info.

        Yields:
            Item: one item per meaning of the term, emitted from the last
            meaning to the first; ``importance`` is the meaning's 1-based
            position in the list.
        """
        TITLE_XPATH = '//*[@id="middle"]/div/div[1]/h1/a'
        NEW_XPATH = '//h1[contains(@class, "novo")]'
        ORIGIN_XPATH = '//*[@id="content"]/p/text()'
        SYNONYMS_XPATH = '//*[@id="related-posts-MRP_all"]/ul/li/a/text()'
        DESCRIPTIONS_XPATH = '//*[@id="content"]/ol/li/p/text()'
        DESCRIPTION_XPATH = '//*[@id="content"]/ol/li[{}]/p/text()'
        EXAMPLES_XPATH = '//*[@id="content"]/ul/li/p/text()'
        EXAMPLE_XPATH = '//*[@id="content"]/ul/li[{}]/p/text()'

        for item in response.xpath('//*[@id="middle"]/div/div[1]'):
            # Get term
            title = item.xpath(TITLE_XPATH).css("a::text").extract_first()
            if title is None:
                # Without a title there is no term to record; calling
                # .strip() on None would raise AttributeError.
                self.logger.warning('No term title found on %s', response.url)
                continue
            term = title.strip()

            # Get the new status - new word
            new = bool(item.xpath(NEW_XPATH).extract())

            # Get origin - ang., gr., kratica
            # Stays an empty list when the page has no origin text.
            origin = item.xpath(ORIGIN_XPATH).extract()
            if origin:
                origin = origin[0]

            # Get synonyms
            synonyms = item.xpath(SYNONYMS_XPATH).extract()

            # Get descriptions
            descriptions_count = len(item.xpath(DESCRIPTIONS_XPATH).extract())

            # Parse descriptions and examples, from the last list entry
            # down to the first.
            for importance in range(descriptions_count, 0, -1):
                # Get description
                description = item.xpath(
                    DESCRIPTION_XPATH.format(importance)).extract()
                meaning = description[0] if description else []

                # Get example
                examples = item.xpath(
                    EXAMPLE_XPATH.format(importance)).extract()

                if examples:
                    if examples[0] == _NO_EXAMPLES_TEXT:
                        # Remove default copy if there are no examples
                        examples[0] = ''
                    else:
                        # Remove: 1| 2| 3| at the beginning of the first
                        # example (the pattern is anchored at the start).
                        examples[0] = _EXAMPLE_PREFIX_RE.sub(
                            '', examples[0], count=1)

                # Save the data
                yield Item(
                    term=term,
                    new=new,
                    origin=origin,
                    synonyms=synonyms,
                    meaning=meaning,
                    examples=examples,
                    importance=importance,
                )

List all available crawlers

scrapy list

Run crawler

# run a spider by the value of its `name` attribute
scrapy crawl getabc123
# output csv
scrapy crawl allinone -o dictionary.csv
# output json
scrapy crawl allinone -o dictionary.json

Scrapy shell

Scrapy shell is a great tool for discovery. You can play with it to see the data it returns before you use it in the script.

scrapy shell "https://www.cnn.com"

# Statements typed at the shell prompt; the bare values shown between them
# are the results the shell printed for the preceding statement.
response.css('title')
response.css('title').extract()
response.css('title::text').extract()  # list
response.css('title::text')[0].extract()  # string
response.css('title::text').extract_first() # string

# CSS
# .re() applies a regular expression to the selected text and returns the
# matches (or the captured groups) as a list of strings.
response.css('title::text').re(r'Names.*')
['Names to Remember']
response.css('title::text').re(r'N\w+')
['Names']
response.css('title::text').re(r'(\w+) to (\w+)')
['Names', 'Remember']
# NOTE(review): `quote` is not defined anywhere in this session; presumably
# it is a selector for a single quote element on another page -- confirm.
quote.css("span.text::text").extract_first()
quote.css("div.tags a.tag::text").extract()

# XPATH
response.xpath('//title')
[<Selector xpath='//title' data='<title>Names to Remember</title>'>]
response.xpath('//title/text()').extract_first()
'Names to Remember'

Official tutorial