Fake-Name 2017-11-22 23:27:17 -08:00
parent 0570d20d7e
commit 6b5daa3c4a
7 changed files with 517 additions and 20 deletions

scraper/modules/KonaChanFetch.py

@@ -54,11 +54,6 @@ class KonaChanFetcher(scraper.fetchBase.AbstractFetcher):
             character = characterli.find_all('a')[-1].get_text()
             characters.append(character)
-        # print("Meta:")
-        # print("tags:", tags)
-        # print("artists:", artists)
-        # print("characters:", characters)
         for tag in tags:
             if tag not in job.tags:
                 job.tags.append(tag)

scraper/modules/danbooruFetch.py

@@ -27,9 +27,13 @@ class DanbooruFetcher(scraper.fetchBase.AbstractFetcher):
         characterlis = tagsection.find_all('li', class_='category-4')
         artistlis = tagsection.find_all('li', class_='category-1')
         taglis = tagsection.find_all('li', class_='category-0')
+        copyrlis = tagsection.find_all('li', class_='category-3')
         tags = []
+        for copyrli in copyrlis:
+            tag = copyrli.find_all('a')[-1].get_text()
+            tags.append("copyright " + tag)
         for tagli in taglis:
             tag = tagli.find('a', class_="search-tag").get_text()
             tags.append(tag)
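The category-N classes mirror Danbooru's public tag category IDs, which is worth knowing when reading these selectors. A small reference mapping (the IDs are Danbooru convention, not something defined in this commit):

DANBOORU_TAG_CATEGORIES = {
    0: "general",    # class_='category-0'
    1: "artist",     # class_='category-1'
    3: "copyright",  # class_='category-3'
    4: "character",  # class_='category-4'
}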

scraper/modules/gelbooruFetch.py

@@ -24,30 +24,38 @@ class GelbooruFetcher(scraper.fetchBase.AbstractFetcher):
         # db.session = db.Session()

     def extractTags(self, job, tagsection):
-        taglis = tagsection.find_all('li', class_='tag-type-general')
         characterlis = tagsection.find_all('li', class_='tag-type-character')
+        specieslis = tagsection.find_all('li', class_='tag-type-species')
+        copyrlis = tagsection.find_all('li', class_='tag-type-copyright')
         artistlis = tagsection.find_all('li', class_='tag-type-artist')
+        taglis = tagsection.find_all('li', class_='tag-type-general')
         tags = []
         for tagli in taglis:
-            tag = tagli.find_all('a')[1].get_text()
+            tag = tagli.find_all('a')[-1].get_text()
             tags.append(tag)
+        for speciesli in specieslis:
+            tag = speciesli.find_all('a')[-1].get_text()
+            tags.append("species " + tag)
+        for copyrli in copyrlis:
+            tag = copyrli.find_all('a')[-1].get_text()
+            tags.append("copyright " + tag)
         artists = []
         for artistli in artistlis:
-            artist = artistli.find_all('a')[1].get_text()
+            artist = artistli.find_all('a')[-1].get_text()
             artists.append(artist)
         characters = []
         for characterli in characterlis:
-            character = characterli.find_all('a')[1].get_text()
+            character = characterli.find_all('a')[-1].get_text()
             characters.append(character)
         for tag in tags:
             if tag not in job.tags:
                 job.tags.append(tag)
@@ -113,11 +121,9 @@ class GelbooruFetcher(scraper.fetchBase.AbstractFetcher):
         return img['href']

     def extractMeta(self, job, soup):
-        sidebar = soup.find('div', class_='sidebar4').find('div', class_='sidebar3')
-        tagsection = sidebar.find('ul', id='tag-sidebar')
+        tagsection = soup.find('ul', id='tag-sidebar')
         assert tagsection
-        infosection = sidebar.find('div', id='stats')
+        infosection = soup.find('div', id='stats')
         assert infosection

         self.extractTags(job, tagsection)
         self.extractInfo(job, infosection)
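On these Gelbooru-style boards each sidebar tag entry is an li whose last anchor holds the tag name (earlier anchors are typically the "?" wiki link), which is why the changes above and below switch from find_all('a')[1] or find('a') to find_all('a')[-1]. A minimal sketch of the parse these selectors assume (the HTML is illustrative, not taken from the commit):

from bs4 import BeautifulSoup

html = """
<ul id="tag-sidebar">
  <li class="tag-type-artist"><a href="#">?</a> <a href="#">some_artist</a> 42</li>
  <li class="tag-type-general"><a href="#">?</a> <a href="#">landscape</a> 9001</li>
</ul>
"""
tagsection = BeautifulSoup(html, "html.parser").find('ul', id='tag-sidebar')
for li in tagsection.find_all('li', class_='tag-type-general'):
    print(li.find_all('a')[-1].get_text())  # -> landscape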

scraper/modules/r34xxxScrape.py

@@ -25,23 +25,33 @@ class R34xxxFetcher(scraper.fetchBase.AbstractFetcher):
     def extractTags(self, job, tagsection):
         characterlis = tagsection.find_all('li', class_='tag-type-character')
+        specieslis = tagsection.find_all('li', class_='tag-type-species')
+        copyrlis = tagsection.find_all('li', class_='tag-type-copyright')
         artistlis = tagsection.find_all('li', class_='tag-type-artist')
         taglis = tagsection.find_all('li', class_='tag-type-general')
         tags = []
         for tagli in taglis:
-            tag = tagli.find('a').get_text()
+            tag = tagli.find_all('a')[-1].get_text()
             tags.append(tag)
+        for speciesli in specieslis:
+            tag = speciesli.find_all('a')[-1].get_text()
+            tags.append("species " + tag)
+        for copyrli in copyrlis:
+            tag = copyrli.find_all('a')[-1].get_text()
+            tags.append("copyright " + tag)
         artists = []
         for artistli in artistlis:
-            artist = artistli.find('a').get_text()
+            artist = artistli.find_all('a')[-1].get_text()
             artists.append(artist)
         characters = []
         for characterli in characterlis:
-            character = characterli.find('a').get_text()
+            character = characterli.find_all('a')[-1].get_text()
             characters.append(character)
         for tag in tags:

scraper/modules/tbibFetch.py

@@ -0,0 +1,240 @@
import traceback
import urllib.error
import urllib.parse
import re
import time
import datetime

import sqlalchemy.exc
import parsedatetime

import scraper.runstate
import scraper.database as db
import scraper.fetchBase


class TbibFetcher(scraper.fetchBase.AbstractFetcher):
    pluginkey = 'TBIB'
    loggerpath = "Main.TBIB"

    content_count_max = 6360000

    def __init__(self):
        super().__init__()
        # db.session = db.Session()

    def extractTags(self, job, tagsection):
        characterlis = tagsection.find_all('li', class_='tag-type-character')
        specieslis = tagsection.find_all('li', class_='tag-type-species')
        copyrlis = tagsection.find_all('li', class_='tag-type-copyright')
        artistlis = tagsection.find_all('li', class_='tag-type-artist')
        taglis = tagsection.find_all('li', class_='tag-type-general')

        tags = []
        for tagli in taglis:
            tag = tagli.find_all('a')[-1].get_text()
            tags.append(tag)
        for speciesli in specieslis:
            tag = speciesli.find_all('a')[-1].get_text()
            tags.append("species " + tag)
        for copyrli in copyrlis:
            tag = copyrli.find_all('a')[-1].get_text()
            tags.append("copyright " + tag)

        artists = []
        for artistli in artistlis:
            artist = artistli.find_all('a')[-1].get_text()
            artists.append(artist)

        characters = []
        for characterli in characterlis:
            character = characterli.find_all('a')[-1].get_text()
            characters.append(character)

        for tag in tags:
            if tag not in job.tags:
                job.tags.append(tag)
        for artist in artists:
            if artist not in job.artist:
                job.artist.append(artist)
        for character in characters:
            if character not in job.character:
                job.character.append(character)

    def getxy(self, instr):
        # Extract the "WxH" dimensions from the stats "Size" line.
        found = re.search(r"(\d+)x(\d+)", instr)
        x, y = found.groups()
        return x, y

    def extractInfo(self, job, infosection):
        for li in infosection.find_all("li"):
            rawt = li.get_text().strip()
            if not rawt:
                continue
            if ":" not in rawt:
                print("rawt: '{}'".format(rawt))
                # Skip lines with no key/value separator; the unpack
                # below would otherwise raise ValueError.
                continue
            name, val = rawt.split(":", 1)
            name = name.strip()
            val = val.strip()
            if name == 'Rating':
                job.rating = val
            elif name == 'Favorites':
                job.favorites = val
            elif name == 'Score':
                job.score = val.split()[0]
            elif name == 'Posted':
                # Trim the "at <time>" / "by <uploader>" tails, then let
                # parsedatetime handle the remainder.
                cal = parsedatetime.Calendar()
                itemdate = val.split("at")[0]
                itemdate = itemdate.split("by")[0]
                print("itemdate", itemdate)
                tstruct, pstat = cal.parse(itemdate)
                print("Ret: ", pstat, tstruct)
                assert pstat in (1, 2, 3)
                job.posted = datetime.datetime.fromtimestamp(time.mktime(tstruct))
            elif name == 'Size':
                job.imgx, job.imgy = self.getxy(val)
            elif name == 'Status':
                job.status = val
                # Do not try to fetch things that are banned (e.g. removed)
                if val == 'Banned':
                    job.state = 'removed'
                    job.err_str = 'item banned'
            elif name in ['Approver', 'Id', 'Source', 'Uploader']:
                pass
            else:
                self.log.warning("Unknown item key-value:")
                self.log.warning("    '{}' -> '{}'".format(name, val))

    def getImageUrl(self, soup):
        img = soup.find('a', text='Original image')
        return img['href']

    def extractMeta(self, job, soup):
        sidebar = soup.find('div', class_='sidebar')
        tagsection = sidebar.find('ul', id='tag-sidebar')
        assert tagsection
        infosection = sidebar.find('div', id='stats')
        assert infosection
        self.extractTags(job, tagsection)
        self.extractInfo(job, infosection)
        imgurl = self.getImageUrl(soup)
        return imgurl

    def fetchImage(self, job, url, srcurl):
        url = urllib.parse.urljoin(srcurl, url)
        fname = url.split("/")[-1]
        cont = self.wg.getpage(url, addlHeaders={'Referer': srcurl})
        fpath = self.saveFileRow(job, fname, cont)
        self.log.info("Saved file to path: '%s'", fpath)

        job.filename = fname
        job.filepath = fpath
        job.state = 'complete'
        db.session.commit()
        # print(fname)

    def processJob(self, job):
        pageurl = 'http://tbib.org/index.php?page=post&s=view&id={}'.format(job.postid)
        while 1:
            try:
                soup = self.wg.getSoup(pageurl)
                if 'You are viewing an advertisement' in soup.get_text():
                    self.log.warning("Working around advertisement. Sleeping 13 seconds")
                    time.sleep(13)
                else:
                    break
            except urllib.error.URLError:
                job.state = 'error'
                job.err_str = 'failure fetching container page'
                db.session.commit()
                return

        if 'Gelbooru - Image List' in soup.title.get_text():
            self.log.warning("Image has been removed.")
            job.state = 'removed'
            job.err_str = 'image has been removed'
            db.session.commit()
            return

        if 'This post was deleted. Reason: Duplicate of' in soup.get_text():
            self.log.warning("Image has been removed.")
            job.state = 'removed'
            job.err_str = 'image has been removed because it was a duplicate'
            db.session.commit()
            return

        err = 0
        while err < 5:
            try:
                imgurl = self.extractMeta(job, soup)
                if imgurl:
                    self.fetchImage(job, imgurl, pageurl)
                else:
                    self.log.info("No image found for URL: '%s'", pageurl)
                    job.state = 'error'
                    job.err_str = 'failed to find image!'
                break
            except sqlalchemy.exc.IntegrityError:
                # Transient DB conflict; roll back and retry (up to 5 times).
                err += 1
                db.session.rollback()
            except urllib.error.URLError:
                job.state = 'error'
                job.err_str = 'failure fetching actual image'
                # Give up rather than retrying forever; err is not
                # incremented on this path.
                break
        db.session.commit()

    def retreiveItem(self):
        job = self.get_job()
        if not job:
            return False
        self.processJob(job)
        return True


def run(indice):
    print("Runner {}!".format(indice))
    # The committed code instantiated GelbooruFetcher() here, which looks
    # like a copy-paste slip; this module's own fetcher is the one intended.
    fetcher = TbibFetcher()
    remainingTasks = True
    try:
        while remainingTasks and scraper.runstate.run:
            remainingTasks = fetcher.retreiveItem()
    except KeyboardInterrupt:
        return
    except:
        print("Unhandled exception!")
        traceback.print_exc()
        raise


if __name__ == '__main__':
    import logSetup
    logSetup.initLogging()
    run(1)
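The 'Posted' handling above leans on parsedatetime's two-value return; a minimal sketch of that contract (the input string is hypothetical):

import time
import datetime
import parsedatetime

cal = parsedatetime.Calendar()
tstruct, pstat = cal.parse("2017-11-20 10:33")  # hypothetical "Posted" value
# pstat: 0 = parse failed, 1 = date only, 2 = time only, 3 = date and time
if pstat in (1, 2, 3):
    print(datetime.datetime.fromtimestamp(time.mktime(tstruct)))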

scraper/modules/xbooruFetch.py

@@ -0,0 +1,238 @@
import traceback
import urllib.error
import urllib.parse
import re
import time
import datetime

import sqlalchemy.exc
import parsedatetime

import scraper.runstate
import scraper.database as db
import scraper.fetchBase


class XBooruFetcher(scraper.fetchBase.AbstractFetcher):
    pluginkey = 'XBooru'
    loggerpath = "Main.XBooru"

    content_count_max = 710000

    def __init__(self):
        super().__init__()
        # db.session = db.Session()

    def extractTags(self, job, tagsection):
        characterlis = tagsection.find_all('li', class_='tag-type-character')
        specieslis = tagsection.find_all('li', class_='tag-type-species')
        copyrlis = tagsection.find_all('li', class_='tag-type-copyright')
        artistlis = tagsection.find_all('li', class_='tag-type-artist')
        taglis = tagsection.find_all('li', class_='tag-type-general')

        tags = []
        for tagli in taglis:
            tag = tagli.find_all('a')[-1].get_text()
            tags.append(tag)
        for speciesli in specieslis:
            tag = speciesli.find_all('a')[-1].get_text()
            tags.append("species " + tag)
        for copyrli in copyrlis:
            tag = copyrli.find_all('a')[-1].get_text()
            tags.append("copyright " + tag)

        artists = []
        for artistli in artistlis:
            artist = artistli.find_all('a')[-1].get_text()
            artists.append(artist)

        characters = []
        for characterli in characterlis:
            character = characterli.find_all('a')[-1].get_text()
            characters.append(character)

        for tag in tags:
            if tag not in job.tags:
                job.tags.append(tag)
        for artist in artists:
            if artist not in job.artist:
                job.artist.append(artist)
        for character in characters:
            if character not in job.character:
                job.character.append(character)

    def getxy(self, instr):
        # Extract the "WxH" dimensions from the stats "Size" line.
        found = re.search(r"(\d+)x(\d+)", instr)
        x, y = found.groups()
        return x, y

    def extractInfo(self, job, infosection):
        for li in infosection.find_all("li"):
            rawt = li.get_text().strip()
            if not rawt:
                continue
            if ":" not in rawt:
                print("rawt: '{}'".format(rawt))
                # Skip lines with no key/value separator; the unpack
                # below would otherwise raise ValueError.
                continue
            name, val = rawt.split(":", 1)
            name = name.strip()
            val = val.strip()
            if name == 'Rating':
                job.rating = val
            elif name == 'Favorites':
                job.favorites = val
            elif name == 'Score':
                job.score = val.split()[0]
            elif name == 'Posted':
                # Trim the "at <time>" / "by <uploader>" tails, then let
                # parsedatetime handle the remainder.
                cal = parsedatetime.Calendar()
                itemdate = val.split("at")[0]
                itemdate = itemdate.split("by")[0]
                print("itemdate", itemdate)
                tstruct, pstat = cal.parse(itemdate)
                print("Ret: ", pstat, tstruct)
                assert pstat in (1, 2, 3)
                job.posted = datetime.datetime.fromtimestamp(time.mktime(tstruct))
            elif name == 'Size':
                job.imgx, job.imgy = self.getxy(val)
            elif name == 'Status':
                job.status = val
                # Do not try to fetch things that are banned (e.g. removed)
                if val == 'Banned':
                    job.state = 'removed'
                    job.err_str = 'item banned'
            elif name in ['Approver', 'Id', 'Source', 'Uploader']:
                pass
            else:
                self.log.warning("Unknown item key-value:")
                self.log.warning("    '{}' -> '{}'".format(name, val))

    def getImageUrl(self, soup):
        img = soup.find('a', text='Original image')
        return img['href']

    def extractMeta(self, job, soup):
        tagsection = soup.find('ul', id='tag-sidebar')
        assert tagsection
        infosection = soup.find('div', id='stats')
        assert infosection
        self.extractTags(job, tagsection)
        self.extractInfo(job, infosection)
        imgurl = self.getImageUrl(soup)
        return imgurl

    def fetchImage(self, job, url, srcurl):
        url = urllib.parse.urljoin(srcurl, url)
        fname = url.split("/")[-1]
        cont = self.wg.getpage(url, addlHeaders={'Referer': srcurl})
        fpath = self.saveFileRow(job, fname, cont)
        self.log.info("Saved file to path: '%s'", fpath)

        job.filename = fname
        job.filepath = fpath
        job.state = 'complete'
        db.session.commit()
        # print(fname)

    def processJob(self, job):
        # NOTE: the committed code reused the tbib.org URL here; the
        # xbooru.com endpoint below is the assumed intended target for
        # this module.
        pageurl = 'http://xbooru.com/index.php?page=post&s=view&id={}'.format(job.postid)
        while 1:
            try:
                soup = self.wg.getSoup(pageurl)
                if 'You are viewing an advertisement' in soup.get_text():
                    self.log.warning("Working around advertisement. Sleeping 13 seconds")
                    time.sleep(13)
                else:
                    break
            except urllib.error.URLError:
                job.state = 'error'
                job.err_str = 'failure fetching container page'
                db.session.commit()
                return

        if 'Gelbooru - Image List' in soup.title.get_text():
            self.log.warning("Image has been removed.")
            job.state = 'removed'
            job.err_str = 'image has been removed'
            db.session.commit()
            return

        if 'This post was deleted. Reason: Duplicate of' in soup.get_text():
            self.log.warning("Image has been removed.")
            job.state = 'removed'
            job.err_str = 'image has been removed because it was a duplicate'
            db.session.commit()
            return

        err = 0
        while err < 5:
            try:
                imgurl = self.extractMeta(job, soup)
                if imgurl:
                    self.fetchImage(job, imgurl, pageurl)
                else:
                    self.log.info("No image found for URL: '%s'", pageurl)
                    job.state = 'error'
                    job.err_str = 'failed to find image!'
                break
            except sqlalchemy.exc.IntegrityError:
                # Transient DB conflict; roll back and retry (up to 5 times).
                err += 1
                db.session.rollback()
            except urllib.error.URLError:
                job.state = 'error'
                job.err_str = 'failure fetching actual image'
                # Give up rather than retrying forever; err is not
                # incremented on this path.
                break
        db.session.commit()

    def retreiveItem(self):
        job = self.get_job()
        if not job:
            return False
        self.processJob(job)
        return True


def run(indice):
    print("Runner {}!".format(indice))
    # The committed code instantiated GelbooruFetcher() here, which looks
    # like a copy-paste slip; this module's own fetcher is the one intended.
    fetcher = XBooruFetcher()
    remainingTasks = True
    try:
        while remainingTasks and scraper.runstate.run:
            remainingTasks = fetcher.retreiveItem()
    except KeyboardInterrupt:
        return
    except:
        print("Unhandled exception!")
        traceback.print_exc()
        raise


if __name__ == '__main__':
    import logSetup
    logSetup.initLogging()
    run(1)
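Both new fetchers reduce the stats "Size" line to image dimensions with the same regex. A standalone illustration of what getxy yields (the input string is made up; note the groups come back as strings, not ints):

import re

def getxy(instr):
    found = re.search(r"(\d+)x(\d+)", instr)
    return found.groups()

print(getxy("Size: 1600x1200"))  # -> ('1600', '1200')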


@@ -11,6 +11,8 @@ import scraper.modules.gelbooruFetch
 import scraper.modules.r34xxxScrape
 import scraper.modules.KonaChanFetch
 import scraper.modules.e621Scrape
+import scraper.modules.tbibFetch
+import scraper.modules.xbooruFetch

 # THREADS = 6
@@ -19,10 +21,12 @@ THREADS = 15

 PLUGIN_CLASSES = [
     scraper.modules.danbooruFetch.DanbooruFetcher,
-    scraper.modules.gelbooruFetch.GelbooruFetcher,
+    # scraper.modules.gelbooruFetch.GelbooruFetcher,
     scraper.modules.r34xxxScrape.R34xxxFetcher,
     scraper.modules.KonaChanFetch.KonaChanFetcher,
     scraper.modules.e621Scrape.E621Fetcher,
+    scraper.modules.tbibFetch.TbibFetcher,
+    scraper.modules.xbooruFetch.XBooruFetcher,
 ]

 class RunEngine(object):
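RunEngine itself is outside this diff, so the following is only a sketch of how a plugin table like PLUGIN_CLASSES is typically driven; every name below except PLUGIN_CLASSES and retreiveItem is illustrative:

import threading

def worker(fetcher_cls):
    # Drain jobs until the fetcher reports nothing left to do.
    fetcher = fetcher_cls()
    while fetcher.retreiveItem():
        pass

# Hypothetical driver loop, not the real RunEngine:
threads = [threading.Thread(target=worker, args=(cls,)) for cls in PLUGIN_CLASSES]
for t in threads:
    t.start()
for t in threads:
    t.join()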