From 5f28db1d8b57baeabe9c3a59c3d599076e3f6ea2 Mon Sep 17 00:00:00 2001
From: Fake-Name
Date: Mon, 6 Feb 2017 22:01:57 -0800
Subject: [PATCH] Bringing it baaaaack!

---
 README.md        |   8 +++-
 danbooruFetch.py |  10 +++++
 gelbooruFetch.py |  16 +++++--
 main.py          | 106 +++++++++++++++--------------------------------
 patch.py         |   1 +
 r34xxxScrape.py  |  16 ++++++-
 settings.py      |   2 +-
 webFunctions.py  |   2 +-
 8 files changed, 80 insertions(+), 81 deletions(-)

diff --git a/README.md b/README.md
index 77645b1..fccada7 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,9 @@
-Minimalistic high-volume multi-threaded archive-tool for Danbooru & Gelbooru.
+Minimalistic high-volume multi-threaded archive-tool for imagegallery sites.
+
+Currently Supports:
+ - Danbooru
+ - Gelbooru
+ - http://rule34.xxx/
 
 Written because I needed a extremely large image database *with tags* to use
 for some experiments with training neural nets.
@@ -15,6 +20,5 @@ Potential ideas:
  - http://e-shuushuu.net/
  - https://konachan.com/
  - https://chan.sankakucomplex.com/
- - http://rule34.xxx/
  - http://paheal.net/
  - https://e621.net/ (furry?)
\ No newline at end of file
diff --git a/danbooruFetch.py b/danbooruFetch.py
index f68d728..75ccc74 100644
--- a/danbooruFetch.py
+++ b/danbooruFetch.py
@@ -196,3 +196,13 @@ def run(indice):
 		print("Unhandled exception!")
 		traceback.print_exc()
 		raise
+
+
+
+if __name__ == '__main__':
+
+	import logSetup
+	logSetup.initLogging()
+
+	run(1)
+
diff --git a/gelbooruFetch.py b/gelbooruFetch.py
index 4936f87..c6ce06e 100644
--- a/gelbooruFetch.py
+++ b/gelbooruFetch.py
@@ -102,10 +102,12 @@ class GelbooruFetcher(object):
 				job.score = val.split()[0]
 			elif name == 'Posted':
 				cal = parsedatetime.Calendar()
-				itemdate = val.split("at")[0]
-				itemdate = val.split("by")[0]
+				itemdate = val.split("at")[0]
+				itemdate = itemdate.split("by")[0]
+				print("itemdate", itemdate)
 				tstruct, pstat = cal.parse(itemdate)
-				assert pstat == 1 or pstat == 2
+				print("Ret: ", pstat, tstruct)
+				assert pstat == 1 or pstat == 2 or pstat == 3
 				job.posted = datetime.datetime.fromtimestamp(time.mktime(tstruct))
 			elif name == 'Size':
 				job.imgx, job.imgy = self.getxy(val)
@@ -277,3 +279,11 @@ def run(indice):
 		print("Unhandled exception!")
 		traceback.print_exc()
 		raise
+
+if __name__ == '__main__':
+
+	import logSetup
+	logSetup.initLogging()
+
+	run(1)
+
diff --git a/main.py b/main.py
index 3ad307b..df75333 100644
--- a/main.py
+++ b/main.py
@@ -1,6 +1,9 @@
 import sys
 import database as db
 
+
+from sqlalchemy.dialects.postgresql import insert
+
 import logSetup
 logSetup.initLogging()
 
@@ -13,71 +16,26 @@ import concurrent.futures
 # THREADS = 6
 THREADS = 30
 
-def insertDanbooruStartingPoints():
+UPSERT_STEP = 10000
 
-	tmp = db.session.query(db.Releases) \
-		.filter(db.Releases.postid == 1) \
-		.filter(db.Releases.source == 'Danbooru')
+def do_upsert(target, maxitems):
+	for x in range(maxitems, 0, UPSERT_STEP * -1):
 
-	tmp = tmp.count()
-	if not tmp:
-		for x in range(2070000):
-			new = db.Releases(dlstate=0, postid=x, source='Danbooru')
-			db.session.add(new)
-			if x % 10000 == 0:
-				print("Loop ", x, "flushing...")
-				db.session.flush()
-				print("Flushed.")
-	db.session.commit()
+		print("[%s] - Building insert data structure %s -> %s" % (target, x, x+UPSERT_STEP))
+		dat = [{"dlstate" : 0, "postid" : x, "source" : target} for x in range(x, x+UPSERT_STEP)]
+		print("[%s] - Building insert query" % target)
+		q = insert(db.Releases).values(dat)
+		q = q.on_conflict_do_nothing()
+		print("[%s] - Built. Doing insert." % target)
+		ret = db.session.execute(q)
 
-def insertGelbooruStartingPoints():
+		changes = ret.rowcount
+		print("[%s] - Changed rows: %s" % (target, changes))
+		db.session.commit()
 
-	tmp = db.session.query(db.Releases) \
-		.filter(db.Releases.postid == 1) \
-		.filter(db.Releases.source == 'Gelbooru') \
-		.count()
-	if not tmp:
-
-		print("Building insert data structure")
-		dat = [{"dlstate" : 0, "postid" : x, "source" : 'Gelbooru'} for x in range(2900000)]
-		print("Building insert query")
-		q = db.Releases.__table__.insert().values(dat)
-		print("Built. Doing insert.")
-		db.engine.execute(q)
-		print("Done.")
-		# for x in :
-
-		# 	new = db.Releases(dlstate=0, postid=x, source='Gelbooru')
-		# 	# db.session.add(new)
-		# 	if x % 100000 == 0:
-		# 		print("Loop ", x, "flushing...")
-		# 		db.session.flush()
-		# 		print("Flushed.")
-	db.session.commit()
-
-def insertR34xxxStartingPoints():
-
-	tmp = db.session.query(db.Releases) \
-		.filter(db.Releases.postid == 1) \
-		.filter(db.Releases.source == 'Rule34.xxx') \
-		.count()
-	if not tmp:
-
-		print("Building insert data structure")
-		dat = [{"dlstate" : 0, "postid" : x, "source" : 'Rule34.xxx'} for x in range(1844200)]
-		print("Building insert query")
-		q = db.Releases.__table__.insert().values(dat)
-		print("Built. Doing insert.")
-		db.engine.execute(q)
-		print("Done.")
-		# for x in :
-
-		# 	new = db.Releases(dlstate=0, postid=x, source='Gelbooru')
-		# 	# db.session.add(new)
-		# 	if x % 100000 == 0:
-		# 		print("Loop ", x, "flushing...")
-		# 		db.session.flush()
-		# 		print("Flushed.")
-	db.session.commit()
+		if not changes:
+			break
+	print("[%s] - Done." % target)
 
 
 def resetDlstate():
@@ -88,25 +46,29 @@ def go():
 
-	insertDanbooruStartingPoints()
-	insertGelbooruStartingPoints()
-	insertR34xxxStartingPoints()
+	print("Inserting start URLs")
+
+	do_upsert("Danbooru", 2700000)
+	do_upsert('Gelbooru', 3600000)
+	do_upsert('Rule34.xxx', 2300000)
+
+	print("Resetting DL states.")
 	resetDlstate()
-
-	# r34xxxScrape.run(0)
-
+
+	print("Creating run contexts")
 	executor = concurrent.futures.ThreadPoolExecutor(max_workers=THREADS)
 	try:
 		# for x in range(2):
 		# 	executor.submit(danbooruFetch.run, 0)
 		# 	executor.submit(gelbooruFetch.run, 0)
-		for x in range(THREADS):
+		for x in range(THREADS//3):
 			executor.submit(r34xxxScrape.run, x)
-		# for x in range(THREADS//2):
-		# 	executor.submit(danbooruFetch.run, x)
-		# for x in range(THREADS//2):
-		# 	executor.submit(gelbooruFetch.run, x)
+		for x in range(THREADS//2):
+			executor.submit(danbooruFetch.run, x)
+		for x in range(THREADS//2):
+			executor.submit(gelbooruFetch.run, x)
+
+		print("Waiting for workers to complete.")
 		executor.shutdown()
 	except KeyboardInterrupt:
 		print("Waiting for executor.")
diff --git a/patch.py b/patch.py
index 9be98c3..90c52bf 100644
--- a/patch.py
+++ b/patch.py
@@ -59,6 +59,7 @@ def go():
 		proc += 1
 		if proc % 50 == 0:
 			db.session.commit()
+
 
 if __name__ == '__main__':
 	go()
diff --git a/r34xxxScrape.py b/r34xxxScrape.py
index 1c6971d..86aa023 100644
--- a/r34xxxScrape.py
+++ b/r34xxxScrape.py
@@ -87,7 +87,7 @@ class R34xxxFetcher(danbooruFetch.DanbooruFetcher):
 				cal = parsedatetime.Calendar()
 				val = val.split("by")[0]
 				tstruct, pstat = cal.parse(val)
-				assert pstat == 1 or pstat == 2
+				assert pstat == 1 or pstat == 2 or pstat == 3
 				job.posted = datetime.datetime.fromtimestamp(time.mktime(tstruct))
 			elif name == 'Size':
 				if not '\n' in val:
@@ -141,7 +141,7 @@ class R34xxxFetcher(danbooruFetch.DanbooruFetcher):
 		# print(fname)
 
 	def processJob(self, job):
-		pageurl = 'http://rule34.xxx/index.php?page=post&s=view&id={}'.format(job.postid)
+		pageurl = 'https://rule34.xxx/index.php?page=post&s=view&id={}'.format(job.postid)
 		try:
 			soup = self.wg.getSoup(pageurl)
 		except urllib.error.URLError:
@@ -174,6 +174,7 @@ class R34xxxFetcher(danbooruFetch.DanbooruFetcher):
 				break
 			except AssertionError:
 				self.log.info("Assertion error?: '%s'", pageurl)
+				traceback.print_exc()
 				job.dlstate=-50
 				db.session.rollback()
 				break
@@ -210,3 +211,14 @@ def run(indice):
 		print("Unhandled exception!")
 		traceback.print_exc()
 		raise
+
+
+
+if __name__ == '__main__':
+
+	import logSetup
+	logSetup.initLogging()
+
+	run(1)
+
+
diff --git a/settings.py b/settings.py
index c55d8b5..75de43c 100644
--- a/settings.py
+++ b/settings.py
@@ -4,7 +4,7 @@ DATABASE_USER = "dbarchiver"
 DATABASE_PASS = "YEkTYt4sCcWctY"
 
 DATABASE_DB_NAME = "dbmirror"
-DATABASE_IP = "10.1.1.8"
+DATABASE_IP = "10.1.1.61"
 
 # Note that a local socket will be tried before the DATABASE_IP value, so if DATABASE_IP is
 # invalid, it may work anyways.
diff --git a/webFunctions.py b/webFunctions.py
index b8de4bb..e471a2d 100644
--- a/webFunctions.py
+++ b/webFunctions.py
@@ -201,7 +201,7 @@ class WebGetRobust:
 
 		if isinstance(page, bytes):
 			raise ValueError("Received content not decoded! Cannot parse!")
 
-		soup = bs4.BeautifulSoup(page)
+		soup = bs4.BeautifulSoup(page, "lxml")
 		return soup
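The do_upsert() helper added to main.py above replaces the three per-site seeding functions with batched PostgreSQL "INSERT ... ON CONFLICT DO NOTHING" statements, walking the ID space from the newest block downward and stopping once a whole block already exists. Below is a minimal, self-contained sketch of that pattern; the table definition, column names, and seed_ids() helper are illustrative stand-ins for the project's db.Releases model, not its actual schema.

# Sketch of the batched ID-seeding upsert used by do_upsert() in main.py.
# Assumes PostgreSQL and SQLAlchemy >= 1.1 (for on_conflict_do_nothing()).
import sqlalchemy as sa
from sqlalchemy.dialects.postgresql import insert

metadata = sa.MetaData()

# Hypothetical stand-in for the db.Releases table.
releases = sa.Table(
	"releases", metadata,
	sa.Column("id",      sa.Integer, primary_key=True),
	sa.Column("dlstate", sa.Integer, nullable=False),
	sa.Column("postid",  sa.Integer, nullable=False),
	sa.Column("source",  sa.Text,    nullable=False),
	sa.UniqueConstraint("postid", "source"),
)

UPSERT_STEP = 10000

def seed_ids(engine, source, maxitems):
	# Walk the ID space in blocks, newest block first. Existing rows are
	# skipped by ON CONFLICT DO NOTHING, so rowcount only counts rows that
	# were actually inserted; once an entire block inserts nothing, the
	# older blocks are assumed to be seeded already and the loop stops.
	for start in range(maxitems, 0, -UPSERT_STEP):
		rows = [
			{"dlstate": 0, "postid": postid, "source": source}
			for postid in range(start, start + UPSERT_STEP)
		]
		stmt = insert(releases).values(rows).on_conflict_do_nothing()
		with engine.begin() as conn:
			changed = conn.execute(stmt).rowcount
		if not changed:
			break

Because rowcount only reflects newly inserted rows, re-running the seeding after new posts appear upstream extends the table without re-touching ranges that are already present.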
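The relaxed assertions in gelbooruFetch.py and r34xxxScrape.py (pstat == 3 is now accepted) line up with parsedatetime's status flags: Calendar.parse() returns 0 when nothing could be parsed, 1 for a date, 2 for a time, and 3 when both a date and a time were recognised. A short sketch of the same conversion the fetchers perform, using a made-up "Posted" string rather than real site markup:

import time
import datetime
import parsedatetime

cal = parsedatetime.Calendar()
# parse() returns (struct_time, status); status 0 means the string was not
# understood at all, so only 1, 2 and 3 are acceptable here.
tstruct, pstat = cal.parse("2017-02-06 22:01")
assert pstat in (1, 2, 3)
posted = datetime.datetime.fromtimestamp(time.mktime(tstruct))
print(posted)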
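The webFunctions.py change pins BeautifulSoup to the lxml parser instead of letting it guess from whatever is installed, which keeps parsing behaviour consistent across machines and avoids the "no parser was explicitly specified" warning that newer bs4 releases emit. A trivial example of the pinned form (requires the lxml package):

import bs4

html = "<html><body><p>example</p></body></html>"
# Explicit parser choice: same parse tree everywhere lxml is available.
soup = bs4.BeautifulSoup(html, "lxml")
print(soup.p.get_text())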