about summary refs log tree commit diff
path: root/myproject/myproject/spiders
diff options
context:
space:
mode:
Diffstat (limited to 'myproject/myproject/spiders')
-rw-r--r--  myproject/myproject/spiders/__init__.py    4
-rw-r--r--  myproject/myproject/spiders/blogitems.py   32
-rw-r--r--  myproject/myproject/spiders/linkfinder.py  18
-rw-r--r--  myproject/myproject/spiders/shell.py       13
4 files changed, 67 insertions, 0 deletions
diff --git a/myproject/myproject/spiders/__init__.py b/myproject/myproject/spiders/__init__.py
new file mode 100644
index 0000000..ebd689a
--- /dev/null
+++ b/myproject/myproject/spiders/__init__.py
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
diff --git a/myproject/myproject/spiders/blogitems.py b/myproject/myproject/spiders/blogitems.py
new file mode 100644
index 0000000..805681b
--- /dev/null
+++ b/myproject/myproject/spiders/blogitems.py
@@ -0,0 +1,32 @@
+# -*- coding: utf-8 -*-
+import scrapy
+
+from ..items import BlogPostItem
+
+
class BlogItemsSpider(scrapy.Spider):
    """Scrape blog post titles and dates from codyhiar.com year archives.

    Run with ``scrapy crawl blogitems [-a tag=<tag>]``; when a tag is given,
    only that tag's archive page is crawled.
    """

    name = 'blogitems'
    allowed_domains = ['codyhiar.com']
    start_urls = ['http://codyhiar.com/']

    def __init__(self, tag=None, *args, **kwargs):
        """Optionally narrow the crawl to a single tag page.

        :param tag: tag slug passed via ``-a tag=...``; when None the
            class-level ``start_urls`` is left untouched.
        """
        super().__init__(*args, **kwargs)
        # Bug fix: the original always overwrote start_urls, producing the
        # broken URL ".../tags/None" when no tag argument was supplied.
        if tag is not None:
            self.start_urls = ['https://www.codyhiar.com/tags/{}'.format(tag)]

    def parse(self, response):
        """Yield one BlogPostItem per post found in each year archive.

        Each ``.archive`` section carries the year in an ``h3`` heading and
        a ``.post-item`` entry per post (title in the link text, date in
        ``.post-time``).
        """
        for year_archive in response.css('.archive'):
            year = year_archive.css('h3::text').extract_first()
            for post in year_archive.css('.post-item'):
                title = post.css('a::text').extract_first().strip()
                date = post.css('.post-time::text').extract_first()
                # Bug fix: items were print()ed and thrown away; Scrapy only
                # feeds its item pipeline with what the spider *yields*.
                yield BlogPostItem(
                    title=title,
                    date=date,
                    year=year,
                )
+
diff --git a/myproject/myproject/spiders/linkfinder.py b/myproject/myproject/spiders/linkfinder.py
new file mode 100644
index 0000000..04c057b
--- /dev/null
+++ b/myproject/myproject/spiders/linkfinder.py
@@ -0,0 +1,18 @@
+# -*- coding: utf-8 -*-
+import scrapy
+from scrapy.spiders import CrawlSpider, Rule
+from scrapy.linkextractors import LinkExtractor
+
+
class LinkFinderSpider(CrawlSpider):
    """Crawl codyhiar.com and print every internal link discovered.

    Consistency fix: subclass the directly imported ``CrawlSpider`` instead
    of spelling out ``scrapy.spiders.CrawlSpider`` — the file already does
    ``from scrapy.spiders import CrawlSpider, Rule``.
    """

    name = 'linkfinder'
    allowed_domains = ['www.codyhiar.com']
    start_urls = ['https://codyhiar.com/']

    # allow=() matches every extracted link; process_links lets us observe
    # (and pass through) the links before the crawler follows them.
    rules = (Rule(LinkExtractor(allow=()), process_links='print_internal_links'),)

    def print_internal_links(self, links):
        """Print links pointing at codyhiar.com; return all links unchanged.

        :param links: Link objects produced by the rule's LinkExtractor.
        :returns: the same list, so crawling behavior is unaffected.
        """
        for link in links:
            if 'codyhiar.com' in link.url:
                print(link.url)
        return links
diff --git a/myproject/myproject/spiders/shell.py b/myproject/myproject/spiders/shell.py
new file mode 100644
index 0000000..a342921
--- /dev/null
+++ b/myproject/myproject/spiders/shell.py
@@ -0,0 +1,13 @@
+# -*- coding: utf-8 -*-
+"""Invoke the scrapy shell from the spider."""
+import scrapy
+
+
class ShellSpider(scrapy.Spider):
    """Debug spider: fetch the start page, then drop into the Scrapy shell."""

    name = 'shell'
    allowed_domains = ['codyhiar.com']
    start_urls = ['http://codyhiar.com/']

    def parse(self, response):
        """Open an interactive shell on the fetched response for inspection."""
        # Imported locally: the shell is only needed when this debugging
        # spider actually runs, not at module import time.
        from scrapy.shell import inspect_response
        inspect_response(response, self)