diff --git a/.gitignore b/.gitignore
index eb01a94..89b110d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 /__pycache__/*.pyc
 /*.lwp
 /logs
+/*.pyc
diff --git a/database.py b/database.py
index df22331..6c1f0d7 100644
--- a/database.py
+++ b/database.py
@@ -33,8 +33,9 @@ from settings import DATABASE_PASS as C_DATABASE_PASS
 
 SQLALCHEMY_DATABASE_URI = 'postgresql://{user}:{passwd}@{host}:5432/{database}'.format(user=C_DATABASE_USER, passwd=C_DATABASE_PASS, host=C_DATABASE_IP, database=C_DATABASE_DB_NAME)
 
-engine = create_engine(SQLALCHEMY_DATABASE_URI)
+# I was having issues with timeouts because the default connection pool is 5 connections.
+engine = create_engine(SQLALCHEMY_DATABASE_URI, pool_size = 20)
 
 SessionFactory = sessionmaker(bind=engine)
 session = scoped_session(SessionFactory)
 # session = Session()
@@ -197,7 +198,7 @@ class Releases(Base):
 	file = association_proxy('file_rel', 'files', creator=file_creator)
 
 	__table_args__ = (
-		UniqueConstraint('postid'),
+		UniqueConstraint('postid', 'source'),
 		)
 
diff --git a/gelbooruFetch.py b/gelbooruFetch.py
index 5092d72..036e951 100644
--- a/gelbooruFetch.py
+++ b/gelbooruFetch.py
@@ -15,9 +15,9 @@ import os.path
 import time
 import datetime
 
-class DanbooruFetcher(object):
+class GelbooruFetcher(object):
 	def __init__(self):
-		self.log = logging.getLogger("Main.Danbooru")
+		self.log = logging.getLogger("Main.Gelbooru")
 		self.wg = webFunctions.WebGetRobust()
 		# db.session = db.Session()
 
@@ -44,26 +44,27 @@ class DanbooruFetcher(object):
 
 	def extractTags(self, job, tagsection):
 
-		characterlis = tagsection.find_all('li', class_='category-4')
-		artistlis = tagsection.find_all('li', class_='category-1')
-		taglis = tagsection.find_all('li', class_='category-0')
+		taglis = tagsection.find_all('li', class_='tag-type-general')
+		characterlis = tagsection.find_all('li', class_='tag-type-character')
+		artistlis = tagsection.find_all('li', class_='tag-type-artist')
 
 		tags = []
 		for tagli in taglis:
-			tag = tagli.find('a', class_="search-tag").get_text()
+			tag = tagli.find_all('a')[1].get_text()
 			tags.append(tag)
 
 		artists = []
 		for artistli in artistlis:
-			artist = artistli.find('a', class_="search-tag").get_text()
+			artist = artistli.find_all('a')[1].get_text()
 			artists.append(artist)
 
 		characters = []
 		for characterli in characterlis:
-			character = characterli.find('a', class_="search-tag").get_text()
+			character = characterli.find_all('a')[1].get_text()
 			characters.append(character)
 
+
 		for tag in tags:
 			if tag not in job.tags:
 				job.tags.append(tag)
@@ -75,14 +76,19 @@ class DanbooruFetcher(object):
 				job.character.append(character)
 
 	def getxy(self, instr):
-		found = re.search(r"\((\d+)x(\d+)\)", instr)
+		found = re.search(r"(\d+)x(\d+)", instr)
 		x, y = found.groups()
 		return x, y
 
 	def extractInfo(self, job, infosection):
-		imgurl = None
+
 		for li in infosection.find_all("li"):
-			rawt = li.get_text()
+			rawt = li.get_text().strip()
+			if not rawt:
+				continue
+			if not ":" in rawt:
+				print("rawt: '{}'".format(rawt))
+
 			name, val = rawt.split(":", 1)
 			name = name.strip()
@@ -93,41 +99,43 @@ class DanbooruFetcher(object):
 			elif name == 'Favorites':
 				job.favorites = val
 			elif name == 'Score':
-				job.score = val
-			elif name == 'Date':
+				job.score = val.split()[0]
+			elif name == 'Posted':
 				cal = parsedatetime.Calendar()
-				tstruct, pstat = cal.parse(val)
+				itemdate = val.split("at")[0]
+				tstruct, pstat = cal.parse(itemdate)
 				assert pstat == 1 or pstat == 2
 				job.posted = datetime.datetime.fromtimestamp(time.mktime(tstruct))
 			elif name == 'Size':
-				if not '\n' in val:
-					return False
-				fsize, res = val.split("\n")
-				fsize, res = fsize.strip(), res.strip()
-				job.imgx, job.imgy = self.getxy(res)
-				link = li.find("a")
-				if link:
-					imgurl = link['href']
+				job.imgx, job.imgy = self.getxy(val)
 			elif name == 'Status':
 				job.status = val
 				# Do not try to fetch things that are banned (e.g. removed)
 				if val == 'Banned':
 					job.dlstate=-2
-			elif name in ['Approver', 'ID', 'Source', 'Uploader']:
+			elif name in ['Approver', 'Id', 'Source', 'Uploader']:
 				pass
 			else:
 				self.log.warning("Unknown item key-value:")
 				self.log.warning(" '{}' -> '{}'".format(name, val))
-		return imgurl
+
+	def getImageUrl(self, soup):
+		img = soup.find('a', text='Original image')
+		return img['href']
+
 	def extractMeta(self, job, soup):
-		tagsection = soup.find('section', id='tag-list')
+		sidebar = soup.find('div', class_='sidebar4').find('div', class_='sidebar3')
+
+		tagsection = sidebar.find('ul', id='tag-sidebar')
 		assert tagsection
-		infosection = soup.find('section', id='post-information')
+		infosection = sidebar.find('div', id='stats')
 		assert infosection
 		self.extractTags(job, tagsection)
-		imgurl = self.extractInfo(job, infosection)
+		self.extractInfo(job, infosection)
+		imgurl = self.getImageUrl(soup)
+
 		return imgurl
@@ -191,23 +199,32 @@ class DanbooruFetcher(object):
 			# print(fname)
 
 	def processJob(self, job):
-		pageurl = 'https://danbooru.donmai.us/posts/{}'.format(job.postid)
-		try:
-			soup = self.wg.getSoup(pageurl)
-		except urllib.error.URLError:
-			job.dlstate=-1
-			db.session.commit()
-			return
+		pageurl = 'http://gelbooru.com/index.php?page=post&s=view&id={}'.format(job.postid)
+		while 1:
+			try:
+				soup = self.wg.getSoup(pageurl)
+				if 'You are viewing an advertisement' in soup.get_text():
+					self.log.warning("Working around advertisement. Sleeping 10 seconds")
+					time.sleep(10)
+				else:
+					break
+			except urllib.error.URLError:
+				job.dlstate=-1
+				db.session.commit()
+				return
 
-		text = soup.get_text()
-		if 'You need a gold account to see this image.' in text:
-			job.dlstate=-3
-			db.session.commit()
-			return
-		if 'This post was deleted for the following reasons' in text:
+		if 'Gelbooru - Image List' in soup.title.get_text():
+			self.log.warn("Image has been removed.")
 			job.dlstate=-4
 			db.session.commit()
 			return
+
+		# text = soup.get_text()
+		# if 'You need a gold account to see this image.' in text:
+		# 	job.dlstate=-3
+		# 	db.session.commit()
+		# 	return
+
 		err = 0
 		while err < 5:
 			try:
@@ -238,7 +255,7 @@ class DanbooruFetcher(object):
 
 def run(indice):
 	print("Runner {}!".format(indice))
-	fetcher = DanbooruFetcher()
+	fetcher = GelbooruFetcher()
 	remainingTasks = True
 
 	try:
diff --git a/main.py b/main.py
index b79ddbe..aa593ca 100644
--- a/main.py
+++ b/main.py
@@ -1,19 +1,22 @@
+import sys
 import database as db
 import logSetup
 logSetup.initLogging()
 
 import danbooruFetch
+import gelbooruFetch
 import runstate
 import concurrent.futures
 
 THREADS = 1
-THREADS = 25
+THREADS = 30
 
 def insertDanbooruStartingPoints():
 
-	tmp = db.session.query(db.Releases) \
-		.filter(db.Releases.postid == 1) \
+	tmp = db.session.query(db.Releases) \
+		.filter(db.Releases.postid == 1) \
+		.filter(db.Releases.source == 'Danbooru') \
 		.count()
 	if not tmp:
 		for x in range(2070000):
@@ -25,6 +28,31 @@ def insertDanbooruStartingPoints():
 				print("Flushed.")
 	db.session.commit()
 
+def insertGelbooruStartingPoints():
+
+	tmp = db.session.query(db.Releases) \
+		.filter(db.Releases.postid == 1) \
+		.filter(db.Releases.source == 'Gelbooru') \
+		.count()
+	if not tmp:
+
+		print("Building insert data structure")
+		dat = [{"dlstate" : 0, "postid" : x, "source" : 'Gelbooru'} for x in range(2900000)]
+		print("Building insert query")
+		q = db.Releases.__table__.insert().values(dat)
+		print("Built. Doing insert.")
+		db.engine.execute(q)
+		print("Done.")
+		# for x in :
+
+		# 	new = db.Releases(dlstate=0, postid=x, source='Gelbooru')
+		# 	# db.session.add(new)
+		# 	if x % 100000 == 0:
+		# 		print("Loop ", x, "flushing...")
+		# 		db.session.flush()
+		# 		print("Flushed.")
+	db.session.commit()
+
 def resetDlstate():
 	tmp = db.session.query(db.Releases) \
@@ -35,13 +63,19 @@ def resetDlstate():
 
 def go():
 	insertDanbooruStartingPoints()
+	insertGelbooruStartingPoints()
 	resetDlstate()
+
+
+
 	executor = concurrent.futures.ThreadPoolExecutor(max_workers=THREADS)
 	try:
 		# for x in range(2):
-		for x in range(THREADS):
+		for x in range(THREADS//2):
 			executor.submit(danbooruFetch.run, x)
+		for x in range(THREADS//2):
+			executor.submit(gelbooruFetch.run, x)
 		executor.shutdown()
 	except KeyboardInterrupt:
 		print("Waiting for executor.")