trawler.py (forked from qntfy/trawler)
#!/usr/bin/env python
"""
This script downloads Tweets for a given list of usernames.
Run with the -h option to view help info.
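
Example usage (using the defaults and the example files referenced in the --help text):
    python trawler.py --input example_screen_names.txt --token example_token_file.yaml --output ./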
"""
# Standard Library modules
import argparse
import codecs
import collections
import datetime
import os
import pprint
import sys

# Third party modules
import yaml
from twython import Twython, TwythonError

# Local modules
from twitter_crawler import (get_connection, save_tweets_to_json_file,
                             get_screen_names_from_file, get_timeline_crawler,
                             get_console_info_logger)

def main():
    # Make stdout output UTF-8, preventing "'ascii' codec can't encode" errors
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)

    # Parse and document command line options
    parser = argparse.ArgumentParser(description="")
    parser.add_argument('--input', dest='screen_name_file', default="example_screen_names.txt",
                        help='A text file with one screen name per line.')
    parser.add_argument('--token', dest='token_file', default=os.path.expanduser("~") + "/.trawler/default.yaml",
                        help='A configuration file with Twitter API access tokens. See example_token_file.yaml or twitter_oauth_settings.sample.py')
    parser.add_argument('--output', dest='output', default='./',
                        help='Where to output the resulting data.')
    args = parser.parse_args()

    # Set up loggers and output directory
    logger = get_console_info_logger()
    output_directory = args.output
    try:
        if not os.path.exists(output_directory):
            os.makedirs(output_directory)
            logger.info("Created directory: %s" % output_directory)
    except OSError:
        print "Could not create directory:", output_directory
        sys.exit(1)

    # Set up API access
    if args.token_file.endswith('yaml'):
        # YAML file
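        # Assumed YAML layout, inferred from the keys read below (not verified
        # against example_token_file.yaml), e.g.:
        #   consumer_key: "..."
        #   consumer_secret: "..."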
        tokens = yaml.safe_load(open(args.token_file))
    elif args.token_file.endswith('py'):
        # .py file -- surely there is a better way to do this
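        # The parser below expects simple quoted assignments, one per line, e.g.:
        #   consumer_key = "..."
        #   consumer_secret = "..."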
        tokens = {}
        for line in open(args.token_file):
            k, v = [x.strip() for x in line.split("=")]
            tokens[k] = v[1:-1]  # strip the surrounding quotes
    else:
        raise ValueError("Unrecognized token file type -- please use a .yaml or .py file following the examples")
    twython = get_connection(tokens['consumer_key'], tokens['consumer_secret'])
    crawler = get_timeline_crawler(twython, logger=logger)

    # Gather unique screen names
    screen_names = get_screen_names_from_file(args.screen_name_file)

    # Gather tweets for each of the unique screen names
    # NB: in production, one should use `id` as an identifier (which does not change)
    # rather than the `screen_name`, which can be changed at the user's whim.
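    # (A user's numeric id could, for example, be looked up once per screen name via
    # the API -- e.g. Twython's show_user() -- and used as the stable key instead.)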
    for screen_name in screen_names:
        tweet_filename = os.path.join(output_directory, screen_name + ".tweets.gz")
        if os.path.exists(tweet_filename):
            logger.info("File '%s' already exists - will not attempt to download Tweets for '%s'" % (tweet_filename, screen_name))
        else:
            tweets = crawler.get_all_timeline_tweets_for_screen_name(screen_name)
            # Write them out as one-JSON-object-per-line in a gzipped file
            save_tweets_to_json_file(tweets, tweet_filename)


if __name__ == "__main__":
    main()