-
-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathtwitter.py
261 lines (186 loc) · 7.63 KB
/
twitter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
"""
This script has 2 functions, one is used to extract the values we need from
a Tweet JSON response.
The other one creates a Markdown text with the previous generated values and
mirrors the tweet's images to Imgur.
"""
import json
from datetime import datetime
import requests
from imgur import upload_image
# Static request headers sent with every Twitter v1.1 API call.
# NOTE(review): this Bearer token is hard-coded — presumably the well-known
# public token of the Twitter web client; confirm it is safe to ship.
HEADERS = {
    "Authorization": "Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw"
}

# Root of the Twitter v1.1 REST API; endpoint paths are appended to this.
BASE_URL = "https://api.twitter.com/1.1/"

# Domains of url-shortener services whose links get resolved to their real urls.
URL_SHORTENERS = [
    "bit.ly",
    "ow.ly",
    "tinyurl"
]
def transcribe_tweet(tweet_url, template):
    """Generates a Markdown message by filling the values into the message template.

    Parameters
    ----------
    tweet_url : str
        The tweet url.

    template : str
        The message string template. See the template folder for details.

    Returns
    -------
    str
        The post template filled with the tweet data values.

    """
    # Refresh the guest token so the upcoming API request is authorized.
    request_token()

    # The tweet id is the path segment after /status/, minus any query string.
    tweet_id = tweet_url.split("/status/")[-1].split("?")[0]
    final_url = BASE_URL + \
        f"statuses/show.json?id={tweet_id}&tweet_mode=extended"

    # We make a GET request to the tweet url.
    with requests.get(final_url, headers=HEADERS) as tweet_response:
        # We send the JSON source of the tweet to the scrape_tweet function.
        tweet_data = scrape_tweet(tweet_response.text)

    # We start taking the values from the returned dictionary and applying transformations.
    tweet_date = datetime.fromtimestamp(tweet_data["timestamp"])

    # Build the inner images section; empty string when the tweet has no images.
    if tweet_data["images"]:
        image_links_text = "*****\n\n**Imágenes:**\n\n"
        # For each link we mirror it to Imgur and update our inner links template.
        for index, link in enumerate(tweet_data["images"], 1):
            # We upload the image to Imgur and get the new url.
            imgur_url = upload_image(link)
            # We update our inner template with both links (original and Imgur).
            image_links_text += f"[Imagen {index}]({link}) - [Mirror]({imgur_url})\n\n"
    else:
        image_links_text = ""

    # Build the inner videos section; empty string when the tweet has no videos.
    if tweet_data["videos"]:
        video_links_text = "*****\n\n**Video(s):**\n\n"
        for index, link in enumerate(tweet_data["videos"], 1):
            video_links_text += f"[Video {index}]({link})\n\n"
    else:
        video_links_text = ""

    # Build the inner urls section; empty string when the tweet has no links.
    if tweet_data["links"]:
        url_links_text = "*****\n\n**Link(s):**\n\n"
        for index, link in enumerate(tweet_data["links"], 1):
            # Escape parentheses so they do not break the Markdown link syntax.
            link = link.replace("(", "\\(").replace(")", "\\)")
            url_links_text += f"[Link {index}]({link})\n\n"
    else:
        url_links_text = ""

    text_lines = list()

    # We split the tweet text by the new line character.
    for line in tweet_data["text"].split("\n"):
        # If the list element is not empty we apply a custom formatting.
        if line:
            # A backslash before a leading hashtag avoids making a Markdown header.
            if line[0] == "#":
                text_lines.append("\\#" + line[1:])
            else:
                text_lines.append(line)
        else:
            text_lines.append("\n")

    # We join together the tweet text to its original form but with our cleaned formatting.
    tweet_text = "\n".join(text_lines)

    # We fill in the message template with our variables.
    # The templates can be found in the templates folder.
    post_text = template.format(
        tweet_data["fullname"],
        tweet_data["username"],
        tweet_date,
        tweet_date,
        tweet_data["permalink"],
        tweet_text,
        image_links_text,
        video_links_text,
        url_links_text,
        tweet_data["retweets"],
        tweet_data["favorites"]
    )

    return post_text
def scrape_tweet(data):
    """Extracts data from the tweet JSON file.

    Parameters
    ----------
    data : str
        The tweet JSON string.

    Returns
    -------
    dict
        A dictionary containing several important values.

    """
    tweet = json.loads(data)

    # Parse the creation date with its explicit UTC offset (%z) so the
    # datetime is timezone-aware and .timestamp() yields the true epoch
    # instead of reinterpreting the UTC time in the machine's local timezone.
    timestamp = int(datetime.strptime(
        tweet["created_at"], "%a %b %d %H:%M:%S %z %Y").timestamp())

    tweet_id = tweet["id"]
    fullname = tweet["user"]["name"]
    username = tweet["user"]["screen_name"]
    permalink = f"https://twitter.com/{username}/status/{tweet_id}"
    favorites = tweet["favorite_count"]
    retweets = tweet["retweet_count"]

    # We extract all the images and video links.
    image_links = list()
    video_links = list()

    if "extended_entities" in tweet:
        for item in tweet["extended_entities"]["media"]:
            if item["type"] == "photo":
                # Request the largest available rendition of the photo.
                image_links.append(
                    item["media_url_https"] + "?format=jpg&name=4096x4096")
            elif item["type"] == "video":
                # Select the best available video quality by bitrate.
                bitrate = 0
                video_url = ""
                for video in item["video_info"]["variants"]:
                    # Some variants (e.g. playlists) have no bitrate key.
                    if video.get("bitrate", 0) > bitrate:
                        bitrate = video["bitrate"]
                        video_url = video["url"]
                video_links.append(video_url)

    url_links = list()

    # We look for all the links in the tweet and unshorten them.
    for item in tweet["entities"]["urls"]:
        link = item["expanded_url"]
        for shortener in URL_SHORTENERS:
            if shortener in link:
                link = resolve_shortener(link)
                break
        url_links.append(link)

    # We remove the t.co links from the tweet text.
    tweet_text = tweet["full_text"].split(
        "https://t.co")[0].split("http://t.co")[0].strip()

    return {
        "permalink": permalink,
        "timestamp": timestamp,
        "fullname": fullname,
        "username": username,
        "favorites": favorites,
        "retweets": retweets,
        "images": image_links,
        "videos": video_links,
        "links": url_links,
        "text": tweet_text
    }
def request_token():
    """Gets a Guest Token from the API."""
    activation_url = BASE_URL + "guest/activate.json"
    with requests.post(activation_url, headers=HEADERS) as response:
        # Store the token so subsequent API calls are authenticated as a guest.
        HEADERS["x-guest-token"] = response.json()["guest_token"]
def resolve_shortener(url):
    """Gets the real url from the url-shortener service.

    Parameters
    ----------
    url : str
        A shortened url.

    Returns
    -------
    str
        The real url.

    """
    # A HEAD request is enough: the shortener exposes the destination in the
    # Location header of its redirect response, so no body is downloaded.
    response = requests.head(url)
    try:
        return response.headers["location"]
    finally:
        response.close()