blob: 973f022d7d2cdd092942e3531a74b4fef6c35026 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
|
# -*- coding: utf-8 -*-
import scrapy
from ..items import BlogPostItem
class BlogItemsSpider(scrapy.Spider):
    """Spider that collects blog post entries from a codyhiar.com archive page.

    An optional ``tag`` spider argument (e.g. ``scrapy crawl blogitems -a
    tag=python``) points the crawl at that tag's listing page. Each post found
    is emitted as a ``BlogPostItem`` with ``title``, ``date`` and ``year``.
    """

    name = 'blogitems'
    allowed_domains = ['codyhiar.com']
    # Fallback entry point, used only when no ``tag`` argument is supplied.
    start_urls = ['http://codyhiar.com/']

    def __init__(self, tag=None, *args, **kwargs):
        """Configure the crawl entry point.

        Args:
            tag: Name of the tag whose listing page should be crawled. When
                omitted (``None``), the class-level ``start_urls`` is kept
                instead of requesting a bogus ``/tags/None`` URL.
            *args: Forwarded unchanged to ``scrapy.Spider.__init__``.
            **kwargs: Forwarded unchanged to ``scrapy.Spider.__init__``.
        """
        super().__init__(*args, **kwargs)
        # ``tag`` is captured by the explicit parameter, so it never reaches
        # the base class via **kwargs; keep it on the instance ourselves.
        self.tag = tag
        if tag is not None:
            self.start_urls = ['https://www.codyhiar.com/tags/{}'.format(tag)]

    def parse(self, response):
        """Yield one ``BlogPostItem`` per post listed on the page.

        The page is read as a series of ``.archive`` sections; within each,
        the ``h3`` text is taken as the year and every ``.post-item`` element
        is treated as one post.

        Args:
            response: The downloaded page to extract posts from.

        Yields:
            BlogPostItem: populated with ``title`` (whitespace-stripped link
            text, ``''`` when the post has no link text), ``date`` (text of
            ``.post-time``, or ``None`` if absent) and ``year`` (text of the
            section's ``h3``, or ``None`` if absent).
        """
        for year_archive in response.css('.archive'):
            year = year_archive.css('h3::text').extract_first()
            for post in year_archive.css('.post-item'):
                # default='' keeps .strip() from blowing up on a post without
                # anchor text, which would otherwise abort the whole page.
                title = post.css('a::text').extract_first(default='').strip()
                date = post.css('.post-time::text').extract_first()
                yield BlogPostItem(
                    title=title,
                    date=date,
                    year=year,
                )
|