myproject/myproject/spiders/blogitems.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31

# -*- coding: utf-8 -*-
import scrapy

from ..items import BlogPostItem


class BlogItemsSpider(scrapy.Spider):
    """Scrape blog post titles and dates from codyhiar.com archive listings.

    The spider walks every ``.archive`` section on the start page, reads the
    section heading (``h3``) as the year, and yields one ``BlogPostItem`` per
    ``.post-item`` entry found inside that section.

    Pass ``tag`` (e.g. ``scrapy crawl blogitems -a tag=python``) to crawl the
    listing page of a single tag instead of the default start URL.
    """

    name = 'blogitems'
    allowed_domains = ['codyhiar.com']
    start_urls = ['http://codyhiar.com/']

    def __init__(self, tag=None, *args, **kwargs):
        """Initialise the spider, optionally restricting it to one tag.

        Args:
            tag: Tag name whose listing page should be crawled. When omitted
                (``None``), the class-level ``start_urls`` is left in place.
            *args: Forwarded unchanged to ``scrapy.Spider.__init__``.
            **kwargs: Forwarded unchanged to ``scrapy.Spider.__init__``.
        """
        super().__init__(*args, **kwargs)
        # Only override the start URL when a tag was actually supplied;
        # formatting None into the URL would request ".../tags/None".
        if tag is not None:
            self.start_urls = [
                'https://www.codyhiar.com/tags/{}'.format(tag)
            ]

    def parse(self, response):
        """Yield a ``BlogPostItem`` for every post listed in ``response``.

        Args:
            response: The downloaded listing page.

        Yields:
            BlogPostItem: With ``title`` (anchor text, stripped; empty string
            if the entry has no anchor text), ``date`` (text of
            ``.post-time``, or ``None`` if absent) and ``year`` (text of the
            enclosing section's ``h3``, or ``None`` if absent).
        """
        # Debugging hint: scrapy.shell.inspect_response(response, self)
        # can be called here to inspect the response interactively.
        for year_archive in response.css('.archive'):
            year = year_archive.css('h3::text').extract_first()
            for post in year_archive.css('.post-item'):
                # extract_first() returns None when nothing matches; fall
                # back to '' so one malformed entry cannot abort the page.
                raw_title = post.css('a::text').extract_first() or ''
                yield BlogPostItem(
                    title=raw_title.strip(),
                    date=post.css('.post-time::text').extract_first(),
                    year=year,
                )