Commit 019b501

Author: Remy DeCausemaker (committed)

Added script to pull all hfoss feeds and save to a .txt corpus

1 parent 325909a · commit 019b501

File tree: 1 file changed, +31 −0 lines changed


hfossfeedpull.py (+31 lines)
@@ -0,0 +1,31 @@
import feedparser
from bs4 import BeautifulSoup
from pprint import pprint

# RSS/Atom feeds for the hfoss class blogs; commented-out entries are skipped.
feeds = [
    "http://blog-decause.rhcloud.com/rss.xml",
    "https://rasputinfoss.wordpress.com/feed/",
    #"http://amm4108.github.io/feeds/hfoss.atom.xml",
    "http://direkitteh.tumblr.com/rss/",
    "https://fad4470.github.io/feed/",
    "http://sarahbethfederman.com/blog/feed/",
    "http://redtwo-foss.blogspot.com/feeds/posts/default?alt=rss",
    "https://milistisia2.wordpress.com/feed/",
    "https://mellolikejello.wordpress.com/feed/",
    #"https://kaffys.github.io/feed",
    "http://fossclassjeid64.blogspot.com/feeds/posts/default?alt=rss",
    "https://jalfoss.wordpress.com/feed/",
    "http://robertholt.net/all_feed",
    #"https://fortnightblog.wordpress.com/feed/",
]

# g accumulates every post from every feed into one combined .txt corpus.
with open("hfossallposts.txt", "w", encoding="utf-8") as g:
    for feed in feeds:
        d = feedparser.parse(feed)
        # Each feed also gets its own output file, named after the feed's title.
        with open(d.feed.title, "w", encoding="utf-8") as f:
            for item in d.entries:
                pprint(item)  # log the full entry to stdout for inspection
                # Strip the HTML markup from the post summary, keeping text only.
                soup = BeautifulSoup(item.summary, "html.parser")
                contents = "\n".join(soup.stripped_strings)
                f.write(contents)
                g.write(contents)
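As a rough usage sketch (not part of the commit): with feedparser and beautifulsoup4 installed, running the script with python hfossfeedpull.py should produce hfossallposts.txt plus one file per feed, named after each feed's title. The snippet below walks the same per-feed steps on a single feed as a quick sanity check; the URL is just one entry taken from the feeds list above.

import feedparser
from bs4 import BeautifulSoup

# Illustrative single-feed check using one URL from the feeds list.
url = "https://rasputinfoss.wordpress.com/feed/"
d = feedparser.parse(url)
print(d.feed.title)  # the title doubles as the per-feed output filename above
if d.entries:
    first = d.entries[0]
    text = "\n".join(BeautifulSoup(first.summary, "html.parser").stripped_strings)
    print(text[:200])  # first 200 characters of the de-HTML'd summary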
