yiff_image_scraper.py
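"""Scrapes image and attachment links from yiff.party galleries and downloads them.

Rough usage (the page numbers and the -folders flag are optional):
    python yiff_image_scraper.py [start page] [last page] [-folders] url [url ...]
e.g. python yiff_image_scraper.py https://yiff.party/patreon/1
"""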
from bs4 import BeautifulSoup as bs
import requests
import re
import sys
import os
import platform as pf
amountOfLinks = len(sys.argv)-1
urlCounter = 0
imageCounter = 0
skippedCounter = 0
urlList = []
missingFiles = []
downloadedFiles = []
dlFileList = []
userAgent = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36"
dirSep = ""
system = pf.system()
cLastPageFlag = False
if(system == 'Windows'):
dirSep = "\\"
else:
dirSep = "/"
print("Please input a path to save your Images in.\nLeave blank for the default path.")
cPath = input().strip()
if not cPath:
    cPath = '.' + dirSep
elif os.path.isfile(cPath):
    print("The chosen path leads to a file, not a folder.")
    quit()
elif not os.path.isdir(cPath):
    print("The chosen path does not exist.")
    quit()
elif cPath[-1] != dirSep:
    cPath += dirSep
print("\n======Starting Scraper========")
for n in range(amountOfLinks):
urlList.append(sys.argv[n+1])
try:
    startPage = int(sys.argv[1])-1
    urlList.pop(0)
    amountOfLinks -= 1
    try:
        cLastPage = int(sys.argv[2])
        cLastPageFlag = True
        urlList.pop(0)
        amountOfLinks -= 1
        if cLastPage < startPage + 1:
            sys.exit()
    except SystemExit:
        sys.exit("Please choose a lower starting page. Your current page numbers are: Starting Page: " + str(startPage + 1) + ", Last Page: " + str(cLastPage))
    except (ValueError, IndexError):
        pass
except (ValueError, IndexError):
    startPage = 0
# Check the arguments for the "-folders" flag. If present, remove it, decrement amountOfLinks, and set the useFolders flag
if '-folders' in urlList:
    print("Sub folders will be created.\n")
    useFolders = True
    urlList.remove('-folders')
    amountOfLinks -= 1
else:
    useFolders = False
#Checks if there are any links present
if amountOfLinks <= 0:
print("\nPlease enter at least 1 link as argument.\ne.g. https://yiff.party/patreon/1\n")
print("============0/0===============\n")
sys.exit()
#Creates Image Directory
if not os.path.isdir(cPath +"Images"+ dirSep +""):
os.mkdir(cPath +"Images"+ dirSep +"")
#Creates Database Directory
if not os.path.isdir(cPath +"DB"+ dirSep +""):
os.mkdir(cPath +"DB"+ dirSep +"")
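#Each gallery gets a DB text file in this folder that records already downloaded file names, separated by ';'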
def getFlag():
    return cLastPageFlag
def setFlag(boolean):
    global cLastPageFlag #without this, the assignment below would only create a local variable
    cLastPageFlag = boolean
def sanitiseFolderName(rawFolderName):
#First remove all characters that are not alphanumerics or in this list: '_- #!(),.$+
cleanedFolderName = "".join(x for x in rawFolderName if(x.isalnum() or x in "'_- #!(),.$+"))
#Then let's remove any preceding or trailing spaces, periods, commas
cleanedFolderName = cleanedFolderName.strip(' .,')
#If those steps have trimmed the name down to no characters, add a placeholder
if (len(cleanedFolderName) < 1):
        cleanedFolderName = "NA"
return cleanedFolderName
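#For illustration: sanitiseFolderName(" My Post: Part 2? ") returns "My Post Part 2"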
def accountForDuplicates(aDict):
counter = 0
bList = []
cList = []
newDict = {}
    aDict = sorted(aDict.items(), key=lambda item: item[1]) #list of (key, name) pairs sorted by name
    for i1 in range(len(aDict)):
        bList.append(aDict[i1][1])
    for i2 in range(len(aDict)):
        cList.append(aDict[i2][0])
    bList.append("buffer") #sentinel so the last real entry still gets compared
    cList.append("buffer")
    for h in range(len(bList)-1):
        if bList[h] == bList[h+1]:
            newDict[cList[h]] = (str(counter) + " " + bList[h]) #prefix duplicate names with a running counter
            counter += 1
        else:
            newDict[cList[h]] = bList[h]
return newDict
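#For illustration: accountForDuplicates({"0": "a.png", "1": "a.png", "2": "b.png"})
#returns {"0": "0 a.png", "1": "a.png", "2": "b.png"} - one entry of each duplicate group keeps its plain name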
def makeConformUrl(aList):
    #prefixes relative links with the site root so they can be downloaded directly
    for k in range(len(aList)):
        if str(aList[k]).startswith("/"):
            aList[k] = "https://yiff.party" + str(aList[k])
    return aList
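#For illustration: a relative link like "/patreon_data/123/456/a.png" becomes "https://yiff.party/patreon_data/123/456/a.png"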
def downloader(myUrl, myImageName, myGalleryAuthor, postFolderName): #downloads a single file; urls that fail are recorded in missingFiles
global imageCounter
global skippedCounter
global downloadedFiles
global dlFileList
try:
r = requests.get(myUrl, headers = {'User-Agent': userAgent}, timeout=(30,30), stream=True)
if r.status_code == 200:
#If we were passed a valid folder name, use it to make a folder for the post
            if postFolderName:
                # If the file doesn't already exist, download it!
                if myImageName not in dlFileList:
                    # If the folder does not already exist, make it!
                    if not os.path.isdir(cPath + "Images" + dirSep + myGalleryAuthor + dirSep + postFolderName + dirSep):
                        os.mkdir(cPath + "Images" + dirSep + myGalleryAuthor + dirSep + postFolderName + dirSep)
                    with open(cPath + "Images" + dirSep + myGalleryAuthor + dirSep + postFolderName + dirSep + myImageName, 'wb') as f:
                        for chunk in r.iter_content(chunk_size=8192):
                            f.write(chunk)
                    imageCounter += 1
                    downloadedFiles.append(myImageName)
else:
print(">Skipped, already exists!")
skippedCounter += 1
            #If we were passed False instead of a folder name, do not create a sub folder, but simply save in the author's folder
            else:
                # If the file doesn't already exist, download it!
                if myImageName not in dlFileList:
                    with open(cPath + "Images" + dirSep + myGalleryAuthor + dirSep + myImageName, 'wb') as f:
                        for chunk in r.iter_content(chunk_size=8192):
                            f.write(chunk)
                    imageCounter += 1
                    downloadedFiles.append(myImageName)
else:
print(">Skipped, already exists!")
skippedCounter += 1
        else:
            # If we get a bad response, let the user know what it was
            print(">Skipped [" + myUrl + "]:\n>(Error: Bad Response - " + str(r.status_code) + ")")
except Exception as errorCode:
#If we failed out of the download entirely, show the user the exception code
print(">Skipped ["+myUrl+"]:\n>"+"(Error: " + str(errorCode) + ")")
missingFiles.append(myUrl)
return
#short function to get the video link from a link with embedded video like https://yiff.party/vimeo/1
def getEmbeddedVideos(url): #Only tested with Vimeo videos so far, and not yet working reliably
url = "https://yiff.party/vimeo_embed?v=" + str(url).split("/")[-1]
response = requests.get(url, headers = {'User-Agent': userAgent})
regex = r'("url":"[^,]*\.mp4",)'
tempLink = str(re.findall(regex, response.text)[0])
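    #tempLink looks roughly like '"url":"https://.../video.mp4",', so the bare link is field 3 of a split on double quotes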
link = tempLink.split("\"")[3]
return link
def fantiaSubroutine(postList):
linklist = []
for postUrl in postList:
response = requests.get(postUrl, headers = {'User-Agent': userAgent})
soup = bs(response.text, "html.parser")
        try:
            var = soup.find('div', {'class':'col s12 l9'})
            linklist.append(var.a['href'])
            var2 = var.find_all('div', {'class':'yp-post-content'})
            for post in var2:
                try:
                    linklist.append(post.a['href'])
                    continue
                except (AttributeError, TypeError, KeyError):
                    pass
                try:
                    imglist = post.find_all('div', {'class': 'col s12 m6'})
                    for img in imglist:
                        linklist.append(img.a['href'])
                except (AttributeError, TypeError, KeyError):
                    pass
        except (AttributeError, TypeError, KeyError):
            pass
return linklist
def getGalleryName(gallUrl, gallNum):
try:
response = requests.get(gallUrl, headers = {'User-Agent': userAgent})
soup = bs(response.text, "html.parser")
name = soup.find('title').text.split("|")[0]
return name.strip()
    except Exception:
        print("Problem getting the author's name. Using the author's number instead.")
        return gallNum
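#Gallery folders are named "<platform>_<gallery name>", e.g. "patreon_SomeArtist" (the name here is only illustrative)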
def downloadImages(url, urlCounter, useFolders):
imageNameDict = {}
postDateTitleDict = {}
postNumberDict = {}
linkList = []
imgContainerUrls = []
embeddedVideos = []
global imageCounter
imageCounter = 0
global downloadedFiles
downloadedFiles.clear()
    global dlFileList
    global skippedCounter
    skippedCounter = 0
    global startPage #startPage is reassigned below, so it must be declared global here
#Gets the Gallery Author's number. Fails if link is shorter than https://yiff.party/patreon/1.
#Also Creates a directory for the images.
try:
galleryNumber = url.split("/")[4]
platform = url.split("/")[3]
        galleryAuthor = platform + "_" + sanitiseFolderName(getGalleryName(url, galleryNumber))
except IndexError:
print("\nThe given url might not be valid.\nSkipping url: " + url + "\n")
print("============" + str(urlCounter) + "/" + str(amountOfLinks) + "===============\n")
return
else:
if not os.path.isdir(cPath + "Images" + dirSep + galleryAuthor + dirSep):
os.mkdir(cPath + "Images" + dirSep + galleryAuthor + dirSep)
#Gets the page and converts/reads it.
response = requests.get(url, headers = {'User-Agent': userAgent})
soup = bs(response.text, "html.parser")
newUrl = "https://yiff.party/render_posts?s=" + platform + "&c=" + galleryNumber + "&p="
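    #render_posts appears to be the ajax endpoint behind the paginated post listing; it returns an HTML fragment per page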
#searches for the highest page number
lastPage = soup.find_all('a', {'class':'btn pag-btn'})
try:
lastPage = int(lastPage[1]["data-pag"])
        cLPFlag = getFlag()
        if cLPFlag:
            if cLastPage > lastPage:
                sys.exit()
            lastPage = cLastPage
            setFlag(False)
        for i in range(startPage, lastPage):
            imgContainerUrls.append(newUrl + str(i+1)) #appends the page number to the url
        startPage = 0 #the chosen page range only applies to the first gallery
    except SystemExit:
        sys.exit("Last Page Number is too high. Please choose a number lower than or equal to: " + str(lastPage))
    except Exception:
        lastPage = 1
        imgContainerUrls.append(newUrl + str(1))
potOfAllSoup = ""
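    #potOfAllSoup collects the raw HTML of every fetched page so the folder-naming pass further down can search all posts at once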
for containerUrl in imgContainerUrls:
response = requests.get(containerUrl, headers = {'User-Agent': userAgent})
soup = bs(response.text, "html.parser")
potOfAllSoup = potOfAllSoup + response.text
if platform == 'fantia':
fantiaList = []
containersFantia = soup.find_all('div', {'class': 'col s12 m6'})
for cont in containersFantia:
fantiaList.append("https://yiff.party" + cont.a['href'].strip())
linkList += fantiaSubroutine(fantiaList)
continue
containersPart1 = soup.find_all('div', {'class': 'card-action'})
containersPart2 = soup.find_all('div', {'class': 'post-body'})
containersPart3 = soup.find_all('img', {'class': 'lazyload'})
containersPart4 = soup.find_all('p', {'class': 'yp-vimeo-proxy-embed'})
containersPart5 = soup.find_all('div', {'class': 'card-attachments'})
containers = containersPart1 + containersPart2 + containersPart3 + containersPart4 + containersPart5
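        #The concatenation keeps the groups in order, so the cumulative counts below tell which group the running index i falls into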
        #Checks if there are any images on this page and reports an error if not. Also skips the page.
        try:
            containers[0]
        except IndexError:
            page = containerUrl.split("p=")[1]
            print("\nCould not find Images. The cause might be an invalid url, or there just aren't any Images.")
            missingFiles.append("Page " + page + " was skipped. You can retry scraping this page with: python " + sys.argv[0] + " " + page + " " + page + " urls")
            continue
        containerCounter1 = len(containersPart1) #containers with class 'card-action'
        containerCounter2 = containerCounter1 + len(containersPart2) #plus those with class 'post-body'
        containerCounter3 = containerCounter2 + len(containersPart3) #plus those with class 'lazyload'
        containerCounter4 = containerCounter3 + len(containersPart4) #plus those with class 'yp-vimeo-proxy-embed'
        i = 0
#Searches for Image-Boxes.
        for container in containers:
            shortLink = ""
            i += 1
            if i <= containerCounter1:
                try:
                    shortLink = container.a['href']
                except (AttributeError, TypeError, KeyError):
                    continue
            elif i <= containerCounter2:
                try:
                    subContainer = container.find_all('a')
                    for subCont in subContainer:
                        linkList.append(subCont['href'])
                except (AttributeError, KeyError):
                    continue
            elif i <= containerCounter3:
                try:
                    shortLink = container['data-src'].split("&w=")[0]
                    shortLink = "https://" + shortLink.split("ssl:")[1]
                except (KeyError, IndexError):
                    continue
            elif i <= containerCounter4:
                try:
                    embeddedVideos.append(container.a['href'])
                except (AttributeError, TypeError, KeyError):
                    continue
            else:
                try:
                    subContainer = container.p
                    subContainer = subContainer.find_all('a')
                    for subCont in subContainer:
                        linkList.append(subCont['href'])
                except (AttributeError, KeyError):
                    continue
            if shortLink:
                linkList.append(shortLink)
    linkList = makeConformUrl(sorted(linkList))
    linkList = list(dict.fromkeys(linkList)) #removes duplicates while keeping order
    #Hardcoded way of filtering 3rd-party links
    thirdPartyLinks = []
    for entity in list(linkList): #iterate over a copy so removing entries is safe
        if not str(entity).startswith(("https://data.yiff.party", "https://yiff.party")):
            thirdPartyLinks.append(entity)
            linkList.remove(entity)
    for videoLink in list(embeddedVideos): #loop to get the video link of the embedded videos; iterate over a copy since found links are removed
        try:
            linkList.append(getEmbeddedVideos(videoLink))
            #removes embedded links it could resolve
            embeddedVideos.remove(videoLink)
        except Exception:
            pass
    #embedded video links that couldn't be resolved get appended to the 3rd-party text file
    thirdPartyLinks.append("\nEmbedded Video Links:")
    thirdPartyLinks += embeddedVideos
    #Saves the 3rd-party links to a file in the folder of the author
    with open(cPath + "Images" + dirSep + galleryAuthor + dirSep + "3rdPartyLinks.txt", "w+") as f:
        for link in thirdPartyLinks:
            f.write(str(link) + "\n")
    #Creates or checks for a 'db' file listing already downloaded file names
    if not os.path.isfile(cPath + "DB" + dirSep + galleryNumber + ".txt"):
        with open(cPath + "DB" + dirSep + galleryNumber + ".txt", 'w', encoding='utf-8') as f:
            f.writelines(galleryAuthor + '\n;')
    with open(cPath + "DB" + dirSep + galleryNumber + ".txt", 'r', encoding='utf-8') as f:
        dlFileList = f.read()
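    #Note: dlFileList is the raw DB text, so the "already exists" check in downloader() is a substring match on file names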
    for h in range(len(linkList)):
        imageNameDict[str(h)] = str(linkList[h].split("/")[-1]) #the file name is the last segment of the url
    imageNameDict = accountForDuplicates(imageNameDict)
    if useFolders:
        #Fetches an appropriate DATE and TITLE for each URL in the link list via Beautiful Soup
        #falls back on the post number provided by yiff.party if no appropriate title+date can be found
        allSoup = bs(potOfAllSoup, "html.parser")
        for h in range(len(linkList)):
            # Grab the post number (this is yiff.party's numbering, not patreon's)
            # May fail if the URL is not a media URL; in that case use the current loop number - this URL won't be downloaded anyway
            try:
                postNumber = {str(h):str(linkList[h].split("/")[5])}
            except IndexError:
                postNumber = {str(h):str(h).zfill(8)}
            try:
                #Find the location in the soup where the URL in question is located
                location = allSoup.find("a", href=linkList[h].replace("https://yiff.party",""))
                #Search for the part of the post immediately above it that is a span with the 'post-time' class
                timeStamp = location.find_previous("span","grey-text post-time").contents
                trimmedTimeStamp = ''.join(timeStamp).split("T")[0]
                #Search for the part of the post immediately above it that is a span with the 'card-title activator grey-text text-darken-4' class
                postName = location.find_previous("span","card-title activator grey-text text-darken-4").contents
                #Split out the post title and remove any characters that would be illegal in file names
                cleanedPostName = sanitiseFolderName(''.join(postName[0]))
                dateTitle = {str(h):(trimmedTimeStamp + " " + cleanedPostName)}
            #If we can't find a nice post name and date for whatever reason, fall back to the yiff-provided post number
            except Exception:
                dateTitle = postNumber
            postDateTitleDict.update(dateTitle)
print("Starting download of " + str(len(linkList)-1) + " items.")
    #Loops through the Image Urls and downloads them.
    try:
        for i in range(len(linkList)):
            if useFolders:
                postFolderName = postDateTitleDict[str(i)]
            else:
                postFolderName = False
            imageName = imageNameDict[str(i)]
            urlI = linkList[i]
            print("Downloading " + imageName) #Shows the name of the image currently being downloaded
            downloader(urlI, imageName, galleryAuthor, postFolderName)
    except KeyboardInterrupt:
        #On Ctrl+C, persist what was downloaded so far and note the links that were never reached
        with open(cPath + "DB" + dirSep + galleryNumber + ".txt", 'a+', encoding='utf-8') as f:
            f.write(';'.join(downloadedFiles))
        missingFiles.extend(linkList[i:])
        with open(cPath + "Images" + dirSep + galleryAuthor + dirSep + "SkippedLinks.txt", "w+") as f:
            for files in missingFiles:
                f.write(str(files) + "\n")
        print("\nSuccessfully skipped " + str(skippedCounter) + " existing Images/Files!\n")
        print("Successfully downloaded " + str(imageCounter) + " new Images/Files!\n")
        print("============" + str(urlCounter) + "/" + str(amountOfLinks) + "===============\n")
        quit()
    with open(cPath + "DB" + dirSep + galleryNumber + ".txt", 'a+', encoding='utf-8') as f:
        f.write(';'.join(downloadedFiles))
    #Just a finishing message.
    if (imageCounter == 0) and (skippedCounter == 0):
        print("No files downloaded, and no existing files skipped. Maybe there are no files, or you messed up the order of the arguments: python " + sys.argv[0] + " [start page] [last page] urls")
    else:
        print("\nSuccessfully skipped " + str(skippedCounter) + " existing Images/Files!\n")
        print("Successfully downloaded " + str(imageCounter) + " new Images/Files!\n")
    print("============" + str(urlCounter) + "/" + str(amountOfLinks) + "===============\n")
    with open(cPath + "Images" + dirSep + galleryAuthor + dirSep + "SkippedLinks.txt", "w+") as f:
        for files in missingFiles:
            f.write(str(files) + "\n")
#Loops through all Yiff.party-Urls and downloads the images.
for url in urlList:
urlCounter += 1
downloadImages(url, urlCounter, useFolders)