aboutsummaryrefslogtreecommitdiff
path: root/myproject/myproject/spiders/linkfinder.py
diff options
context:
space:
mode:
author: Cody Hiar <codyfh@gmail.com> 2018-03-19 21:30:32 -0600
committer: Cody Hiar <codyfh@gmail.com> 2018-03-19 21:30:32 -0600
commit: 3f5efcb91afc2e6d013800132b92e4a6c297f662 (patch)
tree: 3de0b3c6804ec51d5784d788b8df450eae208f1a /myproject/myproject/spiders/linkfinder.py
Initial commit of working files
Diffstat (limited to 'myproject/myproject/spiders/linkfinder.py')
-rw-r--r-- myproject/myproject/spiders/linkfinder.py 18
1 files changed, 18 insertions, 0 deletions
diff --git a/myproject/myproject/spiders/linkfinder.py b/myproject/myproject/spiders/linkfinder.py
new file mode 100644
index 0000000..04c057b
--- /dev/null
+++ b/myproject/myproject/spiders/linkfinder.py
@@ -0,0 +1,18 @@
+# -*- coding: utf-8 -*-
+import scrapy
+from scrapy.spiders import CrawlSpider, Rule
+from scrapy.linkextractors import LinkExtractor
+
+
+class LinkFinderSpider(CrawlSpider):
+    """Crawl codyhiar.com and print every on-site link that is discovered.
+
+    The single crawl rule passes the links extracted from each response
+    through ``print_internal_links`` before they are scheduled.
+    """
+
+    name = 'linkfinder'
+    # Use the bare domain: it covers codyhiar.com itself as well as its
+    # subdomains (e.g. www.codyhiar.com). Listing only 'www.codyhiar.com'
+    # would not cover the host used in start_urls, so links to
+    # https://codyhiar.com/... would be dropped as offsite.
+    allowed_domains = ['codyhiar.com']
+    start_urls = ['https://codyhiar.com/']
+
+    # No callback is set; follow=True is spelled out so it is obvious that
+    # extracted links keep being crawled.
+    rules = (
+        Rule(
+            LinkExtractor(allow=()),
+            process_links='print_internal_links',
+            follow=True,
+        ),
+    )
+
+    def print_internal_links(self, links):
+        """Print the URL of every link that points at an allowed domain.
+
+        :param links: link objects extracted from one response; each one
+            exposes a ``url`` attribute.
+        :returns: ``links`` unchanged, so no link is removed from the crawl.
+        """
+        # Imported locally so this block does not depend on a change to the
+        # module-level imports.
+        from scrapy.utils.url import url_is_from_any_domain
+
+        for link in links:
+            # Compare the host name instead of searching the whole URL, so a
+            # foreign URL that merely mentions the domain in its path or
+            # query string is not reported as internal.
+            if url_is_from_any_domain(link.url, self.allowed_domains):
+                print(link.url)
+        return links