
Commit 936f4df

committed
Adding the crawler code
1 parent a0ed5dc commit 936f4df

File tree

4 files changed: +246 -0 lines changed


scrapy-spiders/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
scrapy-spiders/investopedia.py

Lines changed: 85 additions & 0 deletions
@@ -0,0 +1,85 @@
from string import ascii_lowercase

import scrapy
from scrapy.spiders import CrawlSpider
from w3lib.html import remove_tags, remove_tags_with_content


class InvestopediaSpider(CrawlSpider):
    name = 'investopedia'
    start_urls = ['http://www.investopedia.com/terms/%s/' % s for s in ascii_lowercase + '1']

    def parse(self, response):
        """
        Parse the response page
        """
        url = response.url

        # 'terms' has to be there in the URL to proceed further
        if 'terms' not in url:
            return

        # if the url ends with '.asp', then that's a topic page
        if url.endswith('.asp'):
            return self._parse_topic_response(response)

        # Otherwise, assume that this is a list page
        return self._parse_topic_list(response)

    def _parse_topic_response(self, response):
        """
        Parses various topics
        e.g. www.investopedia.com/terms/o/oddlottheory.asp
        """
        # Get the title first
        title = response.css('title::text').extract_first()

        # Replace / with a space - creates issues with writing to file
        title = title.replace('/', ' ')

        # Get the first div with id Content
        content = response.css('div#Content')[0]
        content = content.css('div.content-box')

        text = ''
        # Relative XPath ('.//p') keeps the selection inside the content box
        for child in content.xpath('.//p'):

            # Get the text from this child <p></p> tag
            paragraph = child.extract()

            # Remove tags including <p> and <a>
            paragraph = remove_tags(remove_tags_with_content(paragraph, ('script', ))).strip()

            # Replace '&amp;' with '&'
            paragraph = paragraph.replace('&amp;', '&')

            # Replace 'U.S.' with 'US':
            paragraph = paragraph.replace('U.S.', 'US')

            # Some more replacements to improve the default tokenization
            for c in '();.,[]"\'-:/%$+@?':
                paragraph = paragraph.replace(c, ' {} '.format(c))

            # Add to the file
            text += paragraph.lower() + '\n'

        # Save the title and the text both
        filename = 'investopedia_data.txt'
        f = open(filename, 'a')
        f.write(text)
        f.close()

    def _parse_topic_list(self, response):
        """
        Parse the page with the topics listed out
        e.g. www.investopedia.com/terms/o/
        """
        list_element = response.css('ol.list')

        # Iterate through the list of topics
        for l in list_element.css('li'):
            # Extract the URL
            url = l.css('a::attr(href)').extract_first()

            next_page = response.urljoin(url)
            yield scrapy.Request(next_page, callback=self.parse)
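
The text cleaning in _parse_topic_response leans on two w3lib helpers: remove_tags_with_content drops the listed elements (here <script>) together with their bodies, and remove_tags then strips the remaining markup while keeping its text. A minimal standalone sketch of that step; the sample HTML is invented purely for illustration and is not taken from Investopedia:

# Sketch only - sample HTML below is made up for illustration
from w3lib.html import remove_tags, remove_tags_with_content

sample = ('<p>Odd lot theory tracks <a href="/terms/o/oddlot.asp">odd lots</a>'
          '<script>track();</script> &amp; small trades.</p>')

# First drop <script> blocks entirely, then strip the remaining tags
cleaned = remove_tags(remove_tags_with_content(sample, ('script',))).strip()
cleaned = cleaned.replace('&amp;', '&')

print(cleaned)  # Odd lot theory tracks odd lots & small trades.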

scrapy-spiders/qplum.py

Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
import json
import re

from scrapy.spiders import CrawlSpider
from w3lib.html import remove_tags, remove_tags_with_content


class QplumSpider(CrawlSpider):
    name = 'qplum'
    start_urls = ['https://www.qplum.co/articles/{}.json'.format(i) for i in range(300)]

    def parse(self, response):
        """
        Parse the response page
        """
        # Skip error URLs
        if response.status != 200:
            return

        data = json.loads(response.text)
        data = data['content']

        # Remove <script>, <sup>, <math>, <style> tags with the content
        paragraph = remove_tags_with_content(data, which_ones=('script', 'sup', 'math', 'style'))
        # Remove the rest of the tags without removing the content
        paragraph = remove_tags(paragraph)

        # Replace &amp; with &
        paragraph = paragraph.replace('&amp;', '&')
        # Replace &#39; with '
        paragraph = paragraph.replace('&#39;', "'")
        paragraph = paragraph.replace('&rsquo;', "'")
        paragraph = paragraph.replace('&ldquo;', "'")
        paragraph = paragraph.replace('&rdquo;', "'")
        # Replace remaining entities such as &nbsp; with a space
        paragraph = re.sub("&.....;", ' ', paragraph)
        paragraph = re.sub("&....;", ' ', paragraph)

        # Replace 'U.S.' with 'US':
        paragraph = paragraph.replace('U.S.', 'US')

        # Some more replacements to improve the default tokenization
        for c in ['\n', '\r', '\t']:
            paragraph = paragraph.replace(c, ' ')
        for c in '();.,[]"\'-:/%$+@?':
            paragraph = paragraph.replace(c, ' {} '.format(c))

        filename = 'qplum_data.txt'
        f = open(filename, 'a')
        f.write(paragraph.lower() + '\n')
        f.close()
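
One caveat, not from the commit itself: by default Scrapy's HttpErrorMiddleware filters out non-2xx responses before parse() is called, so the `response.status != 200` guard above only fires if error responses are explicitly allowed through. A hedged sketch of how that could be enabled, assuming missing qplum article ids come back as 404:

# Sketch only - assumes missing article ids return HTTP 404
class QplumSpiderWithErrors(QplumSpider):
    # let 404 responses reach parse() instead of being dropped by HttpErrorMiddleware
    handle_httpstatus_list = [404]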

scrapy-spiders/wikipedia.py

Lines changed: 106 additions & 0 deletions
@@ -0,0 +1,106 @@
import scrapy
from scrapy.spiders import CrawlSpider
from w3lib.html import remove_tags, remove_tags_with_content


class WikipediaSpider(CrawlSpider):
    name = 'wikipedia'
    start_urls = ['https://en.wikipedia.org/wiki/Outline_of_finance']

    def parse(self, response):
        """
        Parse the response page
        """
        url = response.url

        if url in WikipediaSpider.start_urls:
            return self._parse_topic_list(response)

        else:
            self.parse_topic_response(response)
            return self._parse_links(response)

    def parse_topic_response(self, response):
        """
        Parse the content
        """

        # Get the title first
        title = response.css('title::text').extract_first()

        # Replace / with a space - creates issues with writing to file
        title = title.replace('/', ' ')

        content = response.css('div#mw-content-text')

        # Just extract all the '<p></p>' children from this
        text = ''
        # Relative XPath ('.//p') keeps the selection inside the article body
        for child in content.xpath('.//p'):

            # Get the text from this child <p></p> tag
            paragraph = child.extract()

            # Remove <script>, <sup>, <math> tags with the content
            paragraph = remove_tags_with_content(paragraph, which_ones=('script', 'sup', 'math'))
            # Remove the rest of the tags without removing the content
            paragraph = remove_tags(paragraph)

            # Replace '&amp;' with '&'
            paragraph = paragraph.replace('&amp;', '&')

            # Replace 'U.S.' with 'US':
            paragraph = paragraph.replace('U.S.', 'US')

            # Some more replacements to improve the default tokenization
            for c in '();.,[]"\'-:/%$+@?':
                paragraph = paragraph.replace(c, ' {} '.format(c))

            # Add to the file
            text += paragraph.lower() + '\n'

        filename = 'wiki_data.txt'
        f = open(filename, 'a')
        f.write(text)
        f.close()

    def _parse_links(self, response):
        """
        Parses the links from the first level of pages
        """
        content = response.css('div#mw-content-text')

        for child in content.xpath('.//p'):
            # Extract the URLs
            urls = child.css('a::attr(href)').extract()

            for url in urls:
                if url is None or 'wiki' not in url:
                    continue

                next_page = response.urljoin(url)
                yield scrapy.Request(next_page, callback=self.parse_topic_response)

    def _parse_topic_list(self, response):
        """
        Parse various topics from the list of topics
        """

        # All of the links on this page are in the bullet points
        # Therefore, extract the 'ul' tags to get the list
        content = response.css('div#mw-content-text')
        lists = content.css('ul')

        # Iterate through each list
        for ul in lists:

            # Iterate through each list item
            for l in ul.css('li'):
                # Extract the URL
                url = l.css('a::attr(href)').extract_first()

                # Skip external links as well as the links to the same page (e.g. TOC)
                if url is None or 'wiki' not in url:
                    continue

                next_page = response.urljoin(url)
                yield scrapy.Request(next_page, callback=self.parse)
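
All three spiders append their cleaned, lower-cased text to local files rather than yielding Scrapy items, so each can be run standalone with `scrapy runspider <file>` or, programmatically, roughly as in the sketch below. The sketch assumes WikipediaSpider is importable from wherever the file lives; the user agent string is an assumption, not part of the commit.

# Sketch only - a minimal programmatic runner for one of the spiders above
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={'USER_AGENT': 'Mozilla/5.0'})  # assumed UA
process.crawl(WikipediaSpider)
process.start()  # blocks until the crawl finishes; wiki_data.txt is appended to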
