Add sites as suggested in https://github.com/fake-name/DanbooruScraper/issues/2
commit 6b5daa3c4a (parent 0570d20d7e)
@@ -54,11 +54,6 @@ class KonaChanFetcher(scraper.fetchBase.AbstractFetcher):
 			character = characterli.find_all('a')[-1].get_text()
 			characters.append(character)
 
-		# print("Meta:")
-		# print("tags:", tags)
-		# print("artists:", artists)
-		# print("characters:", characters)
-
 		for tag in tags:
 			if tag not in job.tags:
 				job.tags.append(tag)
@@ -27,9 +27,13 @@ class DanbooruFetcher(scraper.fetchBase.AbstractFetcher):
 		characterlis = tagsection.find_all('li', class_='category-4')
 		artistlis = tagsection.find_all('li', class_='category-1')
 		taglis = tagsection.find_all('li', class_='category-0')
+		copyrlis = tagsection.find_all('li', class_='category-3')
 
 		tags = []
+		for copyrli in copyrlis:
+			tag = copyrli.find_all('a')[-1].get_text()
+			tags.append("copyright " + tag)
+
 		for tagli in taglis:
 			tag = tagli.find('a', class_="search-tag").get_text()
 			tags.append(tag)
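The Danbooru sidebar encodes each tag's type in a `category-N` CSS class; per this hunk, 0 is general, 1 is artist, 3 is copyright, and 4 is character. A table-driven variant of the same extraction, sketched here under the assumption that the sidebar markup stays as shown (the helper name and sample HTML are invented for illustration):

```python
import bs4

# CSS class -> prefix applied to the tag text; mirrors the loops in the
# hunk above (general tags get no prefix, copyright tags get "copyright ").
CATEGORY_PREFIXES = {
	'category-0': '',
	'category-3': 'copyright ',
}

def extract_prefixed_tags(tagsection):
	tags = []
	for cls, prefix in CATEGORY_PREFIXES.items():
		for li in tagsection.find_all('li', class_=cls):
			# Take the last <a> in the <li>, the same convention the
			# copyright loop uses above.
			tags.append(prefix + li.find_all('a')[-1].get_text())
	return tags

# Usage against a minimal stand-in for the real sidebar markup:
html = '''<ul>
	<li class="category-0"><a href="?">?</a> <a href="/posts?tags=1girl">1girl</a></li>
	<li class="category-3"><a href="?">?</a> <a href="/posts?tags=touhou">touhou</a></li>
</ul>'''
tagsection = bs4.BeautifulSoup(html, 'html.parser')
print(extract_prefixed_tags(tagsection))  # ['1girl', 'copyright touhou']
```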
@@ -24,30 +24,38 @@ class GelbooruFetcher(scraper.fetchBase.AbstractFetcher):
 
 		# db.session = db.Session()
 
 
 	def extractTags(self, job, tagsection):
 
-		taglis = tagsection.find_all('li', class_='tag-type-general')
 		characterlis = tagsection.find_all('li', class_='tag-type-character')
+		specieslis = tagsection.find_all('li', class_='tag-type-species')
+		copyrlis = tagsection.find_all('li', class_='tag-type-copyright')
 		artistlis = tagsection.find_all('li', class_='tag-type-artist')
+		taglis = tagsection.find_all('li', class_='tag-type-general')
 
 
 		tags = []
 		for tagli in taglis:
-			tag = tagli.find_all('a')[1].get_text()
+			tag = tagli.find_all('a')[-1].get_text()
 			tags.append(tag)
 
+		for speciesli in specieslis:
+			tag = speciesli.find_all('a')[-1].get_text()
+			tags.append("species " + tag)
+
+		for copyrli in copyrlis:
+			tag = copyrli.find_all('a')[-1].get_text()
+			tags.append("copyright " + tag)
+
 		artists = []
 		for artistli in artistlis:
-			artist = artistli.find_all('a')[1].get_text()
+			artist = artistli.find_all('a')[-1].get_text()
 			artists.append(artist)
 
 		characters = []
 		for characterli in characterlis:
-			character = characterli.find_all('a')[1].get_text()
+			character = characterli.find_all('a')[-1].get_text()
 			characters.append(character)
 
 
 		for tag in tags:
 			if tag not in job.tags:
 				job.tags.append(tag)

@@ -113,11 +121,9 @@ class GelbooruFetcher(scraper.fetchBase.AbstractFetcher):
 		return img['href']
 
 	def extractMeta(self, job, soup):
-		sidebar = soup.find('div', class_='sidebar4').find('div', class_='sidebar3')
-
-		tagsection = sidebar.find('ul', id='tag-sidebar')
+		tagsection = soup.find('ul', id='tag-sidebar')
 		assert tagsection
-		infosection = sidebar.find('div', id='stats')
+		infosection = soup.find('div', id='stats')
 		assert infosection
 		self.extractTags(job, tagsection)
 		self.extractInfo(job, infosection)
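The recurring `find_all('a')[1]` to `find_all('a')[-1]` change is the substance of the Gelbooru and R34xxx fixes: each tag `<li>` holds several links (typically a `?` wiki link, the tag itself, and on some skins a post-count link), so a fixed index breaks as soon as the markup gains or loses a link, while the last anchor stays the tag name as long as it is listed last. A minimal sketch with made-up markup for both shapes:

```python
import bs4

# Two plausible layouts of the same tag <li>; only the second has a
# wiki link between the "?" link and the tag link.
one_extra = '<li class="tag-type-general"><a href="?">?</a> <a href="/t">long_hair</a></li>'
two_extra = '<li class="tag-type-general"><a href="?">?</a> <a href="/w">wiki</a> <a href="/t">long_hair</a></li>'

for html in (one_extra, two_extra):
	li = bs4.BeautifulSoup(html, 'html.parser').li
	# A fixed index ([1]) names the tag in only one of the two layouts;
	# the last anchor is the tag name in both.
	print(li.find_all('a')[-1].get_text())  # long_hair, twice
```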
@@ -25,23 +25,33 @@ class R34xxxFetcher(scraper.fetchBase.AbstractFetcher):
 	def extractTags(self, job, tagsection):
 
 		characterlis = tagsection.find_all('li', class_='tag-type-character')
+		specieslis = tagsection.find_all('li', class_='tag-type-species')
+		copyrlis = tagsection.find_all('li', class_='tag-type-copyright')
 		artistlis = tagsection.find_all('li', class_='tag-type-artist')
 		taglis = tagsection.find_all('li', class_='tag-type-general')
 
 
 		tags = []
 		for tagli in taglis:
-			tag = tagli.find('a').get_text()
+			tag = tagli.find_all('a')[-1].get_text()
 			tags.append(tag)
 
+		for speciesli in specieslis:
+			tag = speciesli.find_all('a')[-1].get_text()
+			tags.append("species " + tag)
+
+		for copyrli in copyrlis:
+			tag = copyrli.find_all('a')[-1].get_text()
+			tags.append("copyright " + tag)
+
 		artists = []
 		for artistli in artistlis:
-			artist = artistli.find('a').get_text()
+			artist = artistli.find_all('a')[-1].get_text()
 			artists.append(artist)
 
 		characters = []
 		for characterli in characterlis:
-			character = characterli.find('a').get_text()
+			character = characterli.find_all('a')[-1].get_text()
 			characters.append(character)
 
 		for tag in tags:
@@ -0,0 +1,240 @@
+
+import traceback
+import urllib.error
+import urllib.parse
+import re
+import time
+import datetime
+
+import sqlalchemy.exc
+import parsedatetime
+
+import scraper.runstate
+import scraper.database as db
+import scraper.fetchBase
+
+class TbibFetcher(scraper.fetchBase.AbstractFetcher):
+
+	pluginkey         = 'TBIB'
+	loggerpath        = "Main.TBIB"
+	content_count_max = 6360000
+
+	def __init__(self):
+		super().__init__()
+
+		# db.session = db.Session()
+
+
+	def extractTags(self, job, tagsection):
+
+		characterlis = tagsection.find_all('li', class_='tag-type-character')
+		specieslis = tagsection.find_all('li', class_='tag-type-species')
+		copyrlis = tagsection.find_all('li', class_='tag-type-copyright')
+		artistlis = tagsection.find_all('li', class_='tag-type-artist')
+		taglis = tagsection.find_all('li', class_='tag-type-general')
+
+
+		tags = []
+		for tagli in taglis:
+			tag = tagli.find_all('a')[-1].get_text()
+			tags.append(tag)
+
+		for speciesli in specieslis:
+			tag = speciesli.find_all('a')[-1].get_text()
+			tags.append("species " + tag)
+
+		for copyrli in copyrlis:
+			tag = copyrli.find_all('a')[-1].get_text()
+			tags.append("copyright " + tag)
+
+		artists = []
+		for artistli in artistlis:
+			artist = artistli.find_all('a')[-1].get_text()
+			artists.append(artist)
+
+		characters = []
+		for characterli in characterlis:
+			character = characterli.find_all('a')[-1].get_text()
+			characters.append(character)
+
+		for tag in tags:
+			if tag not in job.tags:
+				job.tags.append(tag)
+		for artist in artists:
+			if artist not in job.artist:
+				job.artist.append(artist)
+		for character in characters:
+			if character not in job.character:
+				job.character.append(character)
+
+	def getxy(self, instr):
+		# Pull "WIDTHxHEIGHT" out of the stats string, e.g. "1280x720".
+		found = re.search(r"(\d+)x(\d+)", instr)
+		x, y = found.groups()
+		return x, y
+
+	def extractInfo(self, job, infosection):
+
+		for li in infosection.find_all("li"):
+			rawt = li.get_text().strip()
+			if not rawt:
+				continue
+			if ":" not in rawt:
+				print("rawt: '{}'".format(rawt))
+				continue
+
+			name, val = rawt.split(":", 1)
+
+			name = name.strip()
+			val = val.strip()
+
+			if name == 'Rating':
+				job.rating = val
+			elif name == 'Favorites':
+				job.favorites = val
+			elif name == 'Score':
+				job.score = val.split()[0]
+			elif name == 'Posted':
+				cal = parsedatetime.Calendar()
+				# Trim the "at HH:MM" and "by <user>" suffixes before parsing.
+				itemdate = val.split("at")[0]
+				itemdate = itemdate.split("by")[0]
+				print("itemdate", itemdate)
+				tstruct, pstat = cal.parse(itemdate)
+				print("Ret: ", pstat, tstruct)
+				assert pstat in (1, 2, 3)
+				job.posted = datetime.datetime.fromtimestamp(time.mktime(tstruct))
+			elif name == 'Size':
+				job.imgx, job.imgy = self.getxy(val)
+
+			elif name == 'Status':
+				job.status = val
+				# Do not try to fetch things that are banned (e.g. removed)
+				if val == 'Banned':
+					job.state = 'removed'
+					job.err_str = 'item banned'
+			elif name in ['Approver', 'Id', 'Source', 'Uploader']:
+				pass
+			else:
+				self.log.warning("Unknown item key-value:")
+				self.log.warning("	'{}' -> '{}'".format(name, val))
+
+	def getImageUrl(self, soup):
+		img = soup.find('a', text='Original image')
+		return img['href']
+
+	def extractMeta(self, job, soup):
+		sidebar = soup.find('div', class_='sidebar')
+
+		tagsection = sidebar.find('ul', id='tag-sidebar')
+		assert tagsection
+		infosection = sidebar.find('div', id='stats')
+		assert infosection
+		self.extractTags(job, tagsection)
+		self.extractInfo(job, infosection)
+		imgurl = self.getImageUrl(soup)
+
+		return imgurl
+
+
+	def fetchImage(self, job, url, srcurl):
+		url = urllib.parse.urljoin(srcurl, url)
+		fname = url.split("/")[-1]
+
+		cont = self.wg.getpage(url, addlHeaders={'Referer':srcurl})
+
+		fpath = self.saveFileRow(job, fname, cont)
+		self.log.info("Saved file to path: '%s'", fpath)
+
+		job.filename = fname
+		job.filepath = fpath
+		job.state = 'complete'
+		db.session.commit()
+		# print(fname)
+
+	def processJob(self, job):
+		pageurl = 'http://tbib.org/index.php?page=post&s=view&id={}'.format(job.postid)
+		while 1:
+			try:
+				soup = self.wg.getSoup(pageurl)
+				if 'You are viewing an advertisement' in soup.get_text():
+					self.log.warning("Working around advertisement. Sleeping 13 seconds")
+					time.sleep(13)
+				else:
+					break
+			except urllib.error.URLError:
+				job.state = 'error'
+				job.err_str = 'failure fetching container page'
+				db.session.commit()
+				return
+
+		if 'Gelbooru - Image List' in soup.title.get_text():
+			self.log.warning("Image has been removed.")
+			job.state = 'removed'
+			job.err_str = 'image has been removed'
+			db.session.commit()
+			return
+
+		if 'This post was deleted. Reason: Duplicate of' in soup.get_text():
+			self.log.warning("Image has been removed.")
+			job.state = 'removed'
+			job.err_str = 'image has been removed because it was a duplicate'
+			db.session.commit()
+			return
+
+		err = 0
+		while err < 5:
+			try:
+				imgurl = self.extractMeta(job, soup)
+				if imgurl:
+					self.fetchImage(job, imgurl, pageurl)
+				else:
+					self.log.info("No image found for URL: '%s'", pageurl)
+					job.state = 'error'
+					job.err_str = 'failed to find image!'
+				break
+			except sqlalchemy.exc.IntegrityError:
+				err += 1
+				db.session.rollback()
+			except urllib.error.URLError:
+				job.state = 'error'
+				job.err_str = 'failure fetching actual image'
+				db.session.commit()
+
+
+	def retreiveItem(self):
+		job = self.get_job()
+		if not job:
+			return False
+
+		self.processJob(job)
+		return True
+
+
+def run(indice):
+	print("Runner {}!".format(indice))
+	fetcher = TbibFetcher()
+	remainingTasks = True
+
+	try:
+		while remainingTasks and scraper.runstate.run:
+			remainingTasks = fetcher.retreiveItem()
+	except KeyboardInterrupt:
+		return
+	except:
+		print("Unhandled exception!")
+		traceback.print_exc()
+		raise
+
+if __name__ == '__main__':
+
+	import logSetup
+	logSetup.initLogging()
+
+	run(1)
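The `Posted` branch above leans on `parsedatetime`, whose `Calendar.parse()` returns a `(time.struct_time, status)` pair; status 0 means nothing was parsed, which is what the assert guards against. Note that the `split("at")`/`split("by")` trimming is string-based, so it also fires on any date text that happens to contain "at" (e.g. "Sat"). A standalone sketch of the round trip, with an invented date string:

```python
import time
import datetime
import parsedatetime

cal = parsedatetime.Calendar()

# Mimics a "Posted: 2016-04-18 at 09:15 by someuser" stats line,
# pre-trimmed the same way the fetcher trims it.
itemdate = '2016-04-18 '

tstruct, pstat = cal.parse(itemdate)
assert pstat in (1, 2, 3)   # 0 would mean "could not parse"
posted = datetime.datetime.fromtimestamp(time.mktime(tstruct))
print(posted)  # 2016-04-18, with a library-default time of day
```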
@@ -0,0 +1,238 @@
+
+import traceback
+import urllib.error
+import urllib.parse
+import re
+import time
+import datetime
+
+import sqlalchemy.exc
+import parsedatetime
+
+import scraper.runstate
+import scraper.database as db
+import scraper.fetchBase
+
+class XBooruFetcher(scraper.fetchBase.AbstractFetcher):
+
+	pluginkey         = 'XBooru'
+	loggerpath        = "Main.XBooru"
+	content_count_max = 710000
+
+	def __init__(self):
+		super().__init__()
+
+		# db.session = db.Session()
+
+
+	def extractTags(self, job, tagsection):
+
+		characterlis = tagsection.find_all('li', class_='tag-type-character')
+		specieslis = tagsection.find_all('li', class_='tag-type-species')
+		copyrlis = tagsection.find_all('li', class_='tag-type-copyright')
+		artistlis = tagsection.find_all('li', class_='tag-type-artist')
+		taglis = tagsection.find_all('li', class_='tag-type-general')
+
+
+		tags = []
+		for tagli in taglis:
+			tag = tagli.find_all('a')[-1].get_text()
+			tags.append(tag)
+
+		for speciesli in specieslis:
+			tag = speciesli.find_all('a')[-1].get_text()
+			tags.append("species " + tag)
+
+		for copyrli in copyrlis:
+			tag = copyrli.find_all('a')[-1].get_text()
+			tags.append("copyright " + tag)
+
+		artists = []
+		for artistli in artistlis:
+			artist = artistli.find_all('a')[-1].get_text()
+			artists.append(artist)
+
+		characters = []
+		for characterli in characterlis:
+			character = characterli.find_all('a')[-1].get_text()
+			characters.append(character)
+
+		for tag in tags:
+			if tag not in job.tags:
+				job.tags.append(tag)
+		for artist in artists:
+			if artist not in job.artist:
+				job.artist.append(artist)
+		for character in characters:
+			if character not in job.character:
+				job.character.append(character)
+
+	def getxy(self, instr):
+		# Pull "WIDTHxHEIGHT" out of the stats string, e.g. "1280x720".
+		found = re.search(r"(\d+)x(\d+)", instr)
+		x, y = found.groups()
+		return x, y
+
+	def extractInfo(self, job, infosection):
+
+		for li in infosection.find_all("li"):
+			rawt = li.get_text().strip()
+			if not rawt:
+				continue
+			if ":" not in rawt:
+				print("rawt: '{}'".format(rawt))
+				continue
+
+			name, val = rawt.split(":", 1)
+
+			name = name.strip()
+			val = val.strip()
+
+			if name == 'Rating':
+				job.rating = val
+			elif name == 'Favorites':
+				job.favorites = val
+			elif name == 'Score':
+				job.score = val.split()[0]
+			elif name == 'Posted':
+				cal = parsedatetime.Calendar()
+				# Trim the "at HH:MM" and "by <user>" suffixes before parsing.
+				itemdate = val.split("at")[0]
+				itemdate = itemdate.split("by")[0]
+				print("itemdate", itemdate)
+				tstruct, pstat = cal.parse(itemdate)
+				print("Ret: ", pstat, tstruct)
+				assert pstat in (1, 2, 3)
+				job.posted = datetime.datetime.fromtimestamp(time.mktime(tstruct))
+			elif name == 'Size':
+				job.imgx, job.imgy = self.getxy(val)
+
+			elif name == 'Status':
+				job.status = val
+				# Do not try to fetch things that are banned (e.g. removed)
+				if val == 'Banned':
+					job.state = 'removed'
+					job.err_str = 'item banned'
+			elif name in ['Approver', 'Id', 'Source', 'Uploader']:
+				pass
+			else:
+				self.log.warning("Unknown item key-value:")
+				self.log.warning("	'{}' -> '{}'".format(name, val))
+
+	def getImageUrl(self, soup):
+		img = soup.find('a', text='Original image')
+		return img['href']
+
+	def extractMeta(self, job, soup):
+		tagsection = soup.find('ul', id='tag-sidebar')
+		assert tagsection
+		infosection = soup.find('div', id='stats')
+		assert infosection
+		self.extractTags(job, tagsection)
+		self.extractInfo(job, infosection)
+		imgurl = self.getImageUrl(soup)
+
+		return imgurl
+
+
+	def fetchImage(self, job, url, srcurl):
+		url = urllib.parse.urljoin(srcurl, url)
+		fname = url.split("/")[-1]
+
+		cont = self.wg.getpage(url, addlHeaders={'Referer':srcurl})
+
+		fpath = self.saveFileRow(job, fname, cont)
+		self.log.info("Saved file to path: '%s'", fpath)
+
+		job.filename = fname
+		job.filepath = fpath
+		job.state = 'complete'
+		db.session.commit()
+		# print(fname)
+
+	def processJob(self, job):
+		# XBooru post page (Gelbooru-engine URL layout).
+		pageurl = 'http://xbooru.com/index.php?page=post&s=view&id={}'.format(job.postid)
+		while 1:
+			try:
+				soup = self.wg.getSoup(pageurl)
+				if 'You are viewing an advertisement' in soup.get_text():
+					self.log.warning("Working around advertisement. Sleeping 13 seconds")
+					time.sleep(13)
+				else:
+					break
+			except urllib.error.URLError:
+				job.state = 'error'
+				job.err_str = 'failure fetching container page'
+				db.session.commit()
+				return
+
+		if 'Gelbooru - Image List' in soup.title.get_text():
+			self.log.warning("Image has been removed.")
+			job.state = 'removed'
+			job.err_str = 'image has been removed'
+			db.session.commit()
+			return
+
+		if 'This post was deleted. Reason: Duplicate of' in soup.get_text():
+			self.log.warning("Image has been removed.")
+			job.state = 'removed'
+			job.err_str = 'image has been removed because it was a duplicate'
+			db.session.commit()
+			return
+
+		err = 0
+		while err < 5:
+			try:
+				imgurl = self.extractMeta(job, soup)
+				if imgurl:
+					self.fetchImage(job, imgurl, pageurl)
+				else:
+					self.log.info("No image found for URL: '%s'", pageurl)
+					job.state = 'error'
+					job.err_str = 'failed to find image!'
+				break
+			except sqlalchemy.exc.IntegrityError:
+				err += 1
+				db.session.rollback()
+			except urllib.error.URLError:
+				job.state = 'error'
+				job.err_str = 'failure fetching actual image'
+				db.session.commit()
+
+
+	def retreiveItem(self):
+		job = self.get_job()
+		if not job:
+			return False
+
+		self.processJob(job)
+		return True
+
+
+def run(indice):
+	print("Runner {}!".format(indice))
+	fetcher = XBooruFetcher()
+	remainingTasks = True
+
+	try:
+		while remainingTasks and scraper.runstate.run:
+			remainingTasks = fetcher.retreiveItem()
+	except KeyboardInterrupt:
+		return
+	except:
+		print("Unhandled exception!")
+		traceback.print_exc()
+		raise
+
+if __name__ == '__main__':
+
+	import logSetup
+	logSetup.initLogging()
+
+	run(1)
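Both new fetchers wrap the metadata pass in a bounded retry on `sqlalchemy.exc.IntegrityError`, rolling the session back each attempt so a duplicate-row race with another worker thread doesn't wedge the session. The same rollback-then-retry shape, shown here against stdlib `sqlite3` rather than the project's SQLAlchemy session so it runs standalone (the table and filenames are invented):

```python
import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE files (fname TEXT PRIMARY KEY)')
conn.execute("INSERT INTO files VALUES ('123.jpg')")
conn.commit()

fname = '123.jpg'
err = 0
while err < 5:
	try:
		conn.execute('INSERT INTO files VALUES (?)', (fname,))
		conn.commit()
		break
	except sqlite3.IntegrityError:
		# Another writer got there first: undo and retry, just as the
		# fetchers roll back db.session before retrying extractMeta().
		err += 1
		conn.rollback()
		fname = '123_{}.jpg'.format(err)

print(conn.execute('SELECT fname FROM files').fetchall())
# [('123.jpg',), ('123_1.jpg',)]
```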
@@ -11,6 +11,8 @@ import scraper.modules.gelbooruFetch
 import scraper.modules.r34xxxScrape
 import scraper.modules.KonaChanFetch
 import scraper.modules.e621Scrape
+import scraper.modules.tbibFetch
+import scraper.modules.xbooruFetch
 
 
 # THREADS = 6

@@ -19,10 +21,12 @@ THREADS = 15
 
 PLUGIN_CLASSES = [
 	scraper.modules.danbooruFetch.DanbooruFetcher,
-	scraper.modules.gelbooruFetch.GelbooruFetcher,
+	# scraper.modules.gelbooruFetch.GelbooruFetcher,
 	scraper.modules.r34xxxScrape.R34xxxFetcher,
 	scraper.modules.KonaChanFetch.KonaChanFetcher,
 	scraper.modules.e621Scrape.E621Fetcher,
+	scraper.modules.tbibFetch.TbibFetcher,
+	scraper.modules.xbooruFetch.XBooruFetcher,
 ]
 
 class RunEngine(object):
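`RunEngine` itself is outside this hunk, so exactly how `PLUGIN_CLASSES` is consumed is an assumption here; the registration pattern and the `retreiveItem()` contract (return `False` when the queue drains) suggest something like one fetcher instance per worker thread, polled until empty. A minimal sketch with stub classes (`StubFetcher` and `worker` are invented for illustration):

```python
import threading

class StubFetcher(object):
	# Stand-in for the AbstractFetcher subclasses registered above.
	def __init__(self):
		self.remaining = 3
	def retreiveItem(self):
		# Returns False once there is nothing left to do, matching
		# the retreiveItem() contract in the new fetchers.
		self.remaining -= 1
		return self.remaining > 0

PLUGIN_CLASSES = [StubFetcher, StubFetcher]
THREADS = 2

def worker(fetcher_cls):
	fetcher = fetcher_cls()
	while fetcher.retreiveItem():
		pass

threads = []
for cls in PLUGIN_CLASSES:
	for _ in range(THREADS):
		threads.append(threading.Thread(target=worker, args=(cls,)))
for t in threads:
	t.start()
for t in threads:
	t.join()
print("all runners drained")
```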