-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathExtractAppStoreReviews.py
231 lines (180 loc) · 6.55 KB
/
ExtractAppStoreReviews.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
import csv
import json
import logging
import pprint
import requests
import time
import typing
import xmltodict
def is_error_response(http_response, seconds_to_sleep: float = 1) -> bool:
    """
    Decide whether an HTTP response should be treated as a failure.

    A 200 (success) is not an error. A 503 (service unavailable) is also
    reported as non-error, but only after pausing briefly so we do not
    hammer a service that is already struggling. Every other status code
    counts as an error. Meant to be called right after requests.get()
    or requests.post().

    :param http_response:
        The response object returned from requests.post or requests.get.
    :param seconds_to_sleep:
        How long to pause when the service reports 503. This is used to
        not overwhelm the service since it is unavailable.
    """
    status = http_response.status_code
    if status == 200:
        return False
    if status == 503:
        time.sleep(seconds_to_sleep)
        return False
    return True
def get_xml(url) -> typing.Union[str, None]:
    """
    Fetch the raw XML text at *url*.

    Returns the response body as a string on success, or None when the
    request failed (any status other than 200; a 503 additionally sleeps
    briefly inside is_error_response before being reported as non-error).

    Note: the return annotation previously claimed ``dict`` — the function
    has always returned the plain response text; parsing into a dict is
    the caller's job (via xmltodict.parse).

    :param url:
        The url to get the xml from.
    """
    response = requests.get(url)
    if is_error_response(response):
        return None
    # response.text is already decoded text (requests picks the encoding).
    return response.text
def get_all_reviews_available_from_XML(app_id, page=1) -> typing.List[dict]:
    """
    Collect every review the App Store RSS feed exposes for an app.

    Pages through the customer-reviews feed starting at *page* and keeps
    going until a page no longer contains any <entry> elements (one page
    past the last real page, the feed returns an entry-less document).

    :param app_id:
        The numeric App Store id of the app you are searching.
    :param page:
        The page number to start the loop from (defaults to 1).
    :return:
        A list of dictionaries, one per review.
    """
    reviews: typing.List[dict] = []
    while True:
        url = (
            f'https://itunes.apple.com/ca/rss/customerreviews/page={page}/'
            f'id={app_id}/sortby=mostrecent/xml?urlDesc=/customerreviews/'
            f'page={page}/id={app_id}/sortby=mostrecent/xml')
        xml_response = get_xml(url)
        # get_xml returns None on any HTTP failure; stop with what we have
        # instead of crashing inside xmltodict.parse(None).
        if xml_response is None:
            return reviews
        # Convert the XML into a nested dictionary.
        dict_doc = xmltodict.parse(xml_response)
        print("page num: " + str(page))
        # Past the last page the document still parses but has no "entry";
        # that is the loop's exit condition.
        if (dict_doc.get("feed") or {}).get("entry") is None:
            return reviews
        entries = dict_doc['feed']['entry']
        # xmltodict collapses a single <entry> into a dict rather than a
        # one-element list; normalize so the loop below always works.
        if isinstance(entries, dict):
            entries = [entries]
        for entry in entries:
            # Extract the plain-text comment from the <content> list.
            # Reset per entry so a text-less entry cannot inherit the
            # previous entry's comment.
            comment = None
            for content in entry['content']:
                if content['@type'] == "text":
                    comment = content['#text']
            reviews.append({
                'id': entry['id'],
                'updated': entry['updated'],
                'title': entry['title'],
                'comment': comment,
                'voteSum': entry['im:voteSum'],
                'voteCount': entry['im:voteCount'],
                'rating': entry['im:rating'],
                'version': entry['im:version'],
                'name': entry['author']['name'],
                'uri': entry['author']['uri'],
            })
        page += 1
def dump_reviews_iter():
    """
    Parse a previously captured XML feed from "sample.txt" and dump its
    reviews into a timestamped CSV file.

    Offline/debug variant of the main pipeline: instead of hitting the
    App Store it reads a saved XML document and writes one CSV row per
    review into "<unix-timestamp>.csv".
    """
    # TODO: Grab the XML from the HTTP request
    # 'with' guarantees the handle is closed even if parsing fails
    # (the original version leaked both file handles).
    with open("sample.txt", "r") as xml_file:
        dict_doc = xmltodict.parse(xml_file.read())
    header = [
        "id",
        "updated",
        "title",
        "comment",
        "voteSum",
        "voteCount",
        "rating",
        "version",
        "name",
        "uri",
    ]
    with open(str(int(time.time())) + ".csv", "w") as csvfile:
        csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"',
                               quoting=csv.QUOTE_ALL)
        # Header row first, then one row per <entry>.
        csvwriter.writerow(header)
        for entry in dict_doc['feed']['entry']:
            # Extract the plain-text comment from the <content> list.
            for content in entry['content']:
                if content['@type'] == "text":
                    comment = content['#text']
            csvwriter.writerow([
                entry['id'],
                entry['updated'],
                entry['title'],
                comment,
                entry['im:voteSum'],
                entry['im:voteCount'],
                entry['im:rating'],
                entry['im:version'],
                entry['author']['name'],
                entry['author']['uri'],
            ])
def write_dict_to_csv(reviews):
    """
    Write a list of review dictionaries to a timestamped CSV file.

    The output file is named "<unix-timestamp>.csv" and encoded as UTF-8
    with a BOM ("utf-8-sig") so spreadsheet apps detect the encoding.
    A fixed header row is written first, then one row per review.

    :param reviews:
        List of dicts as produced by get_all_reviews_available_from_XML;
        each dict must contain exactly the keys listed in csv_columns.
    """
    # Declare header columns (also the DictWriter field order).
    csv_columns = [
        "id",
        "updated",
        "title",
        "comment",
        "voteSum",
        "voteCount",
        "rating",
        "version",
        "name",
        "uri",
    ]
    # 'with' flushes and closes the file even on error (the original
    # leaked the handle, risking truncated output). newline="" is the
    # csv-module-documented way to avoid doubled line endings on Windows.
    with open(str(int(time.time())) + ".csv", "w",
              encoding="utf-8-sig", newline="") as csvfile:
        csvwriter = csv.DictWriter(csvfile, delimiter=',', quotechar='"',
                                   quoting=csv.QUOTE_ALL,
                                   fieldnames=csv_columns)
        # Header row, then every review in order.
        csvwriter.writeheader()
        csvwriter.writerows(reviews)
def main():
    """Fetch all App Store reviews for the hard-coded app id and export to CSV."""
    # App id 1200050042 is the app this script was written for.
    reviews = get_all_reviews_available_from_XML(1200050042)
    write_dict_to_csv(reviews)
    print(reviews)
    # dump_reviews_iter()


# Guard the entry point so importing this module no longer triggers
# network calls and file writes as a side effect.
if __name__ == "__main__":
    main()

## url https://itunes.apple.com/ca/rss/customerreviews/id=1200050042/page=1/sortby=mostrecent/xml
## https://itunes.apple.com/ca/rss/customerreviews/page=1/id=1200050042/sortby=mostrecent/xml?urlDesc=/customerreviews/page=1/id=1200050042/sortby=mostrecent/xml
## unused code