Bringing it baaaaack!

Fake-Name 2017-02-06 22:01:57 -08:00
parent 1aa23060c6
commit 5f28db1d8b
8 changed files with 80 additions and 81 deletions

View File

@@ -1,4 +1,9 @@
-Minimalistic high-volume multi-threaded archive-tool for Danbooru & Gelbooru.
+Minimalistic high-volume multi-threaded archive-tool for image-gallery sites.
+
+Currently supports:
+- Danbooru
+- Gelbooru
+- http://rule34.xxx/
 
 Written because I needed an extremely large image database *with tags* to
 use for some experiments with training neural nets.
@@ -15,6 +20,5 @@ Potential ideas:
 - http://e-shuushuu.net/
 - https://konachan.com/
 - https://chan.sankakucomplex.com/
-- http://rule34.xxx/
 - http://paheal.net/
 - https://e621.net/ (furry?)

View File

@@ -196,3 +196,13 @@ def run(indice):
         print("Unhandled exception!")
         traceback.print_exc()
         raise
+
+if __name__ == '__main__':
+    import logSetup
+    logSetup.initLogging()
+    run(1)

View File

@@ -102,10 +102,12 @@ class GelbooruFetcher(object):
             job.score = val.split()[0]
         elif name == 'Posted':
             cal = parsedatetime.Calendar()
-            itemdate = val.split("at")[0]
-            itemdate = val.split("by")[0]
+            itemdate = val.split("at")[0]
+            itemdate = itemdate.split("by")[0]
+            print("itemdate", itemdate)
             tstruct, pstat = cal.parse(itemdate)
-            assert pstat == 1 or pstat == 2
+            print("Ret: ", pstat, tstruct)
+            assert pstat == 1 or pstat == 2 or pstat == 3
             job.posted = datetime.datetime.fromtimestamp(time.mktime(tstruct))
         elif name == 'Size':
             job.imgx, job.imgy = self.getxy(val)
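The fix above is worth spelling out: the old code split `val` on "at" and then immediately overwrote the result with a second split of `val` on "by", discarding the first split entirely; the new code chains the two splits, and the assertion is widened because parsedatetime reports status 3 when it recognizes both a date and a time. A minimal standalone sketch of the corrected flow, using a hypothetical "Posted" string (the exact format Gelbooru serves is an assumption here):

```python
import datetime
import time

import parsedatetime

raw = "2017-02-06 at 21:14 by someuser"  # hypothetical "Posted" field value
itemdate = raw.split("at")[0]            # drop "at <time> ..." if present
itemdate = itemdate.split("by")[0]       # drop "by <user> ..." if present

cal = parsedatetime.Calendar()
tstruct, pstat = cal.parse(itemdate)
# parse() returns (time.struct_time, status): 0 = nothing parsed,
# 1 = date only, 2 = time only, 3 = date and time.
assert pstat in (1, 2, 3)
posted = datetime.datetime.fromtimestamp(time.mktime(tstruct))
print(posted)
```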
@@ -277,3 +279,11 @@ def run(indice):
         print("Unhandled exception!")
         traceback.print_exc()
         raise
+
+if __name__ == '__main__':
+    import logSetup
+    logSetup.initLogging()
+    run(1)

main.py (106 changed lines)
View File

@@ -1,6 +1,9 @@
 import sys
 import database as db
+from sqlalchemy.dialects.postgresql import insert
 import logSetup
 logSetup.initLogging()
@@ -13,71 +16,26 @@ import concurrent.futures
 # THREADS = 6
 THREADS = 30
 
-def insertDanbooruStartingPoints():
+UPSERT_STEP = 10000
 
-    tmp = db.session.query(db.Releases) \
-        .filter(db.Releases.postid == 1) \
-        .filter(db.Releases.source == 'Danbooru')
+def do_upsert(target, maxitems):
+    for x in range(maxitems, 0, UPSERT_STEP * -1):
-    tmp = tmp.count()
-    if not tmp:
-        for x in range(2070000):
-            new = db.Releases(dlstate=0, postid=x, source='Danbooru')
-            db.session.add(new)
-            if x % 10000 == 0:
-                print("Loop ", x, "flushing...")
-                db.session.flush()
-                print("Flushed.")
-    db.session.commit()
+        print("[%s] - Building insert data structure %s -> %s" % (target, x, x+UPSERT_STEP))
+        dat = [{"dlstate" : 0, "postid" : x, "source" : target} for x in range(x, x+UPSERT_STEP)]
+        print("[%s] - Building insert query" % target)
+        q = insert(db.Releases).values(dat)
+        q = q.on_conflict_do_nothing()
+        print("[%s] - Built. Doing insert." % target)
+        ret = db.session.execute(q)
-def insertGelbooruStartingPoints():
+        changes = ret.rowcount
+        print("[%s] - Changed rows: %s" % (target, changes))
+        db.session.commit()
-    tmp = db.session.query(db.Releases) \
-        .filter(db.Releases.postid == 1) \
-        .filter(db.Releases.source == 'Gelbooru') \
-        .count()
-    if not tmp:
-        print("Building insert data structure")
-        dat = [{"dlstate" : 0, "postid" : x, "source" : 'Gelbooru'} for x in range(2900000)]
-        print("Building insert query")
-        q = db.Releases.__table__.insert().values(dat)
-        print("Built. Doing insert.")
-        db.engine.execute(q)
-        print("Done.")
-    # for x in :
-    #     new = db.Releases(dlstate=0, postid=x, source='Gelbooru')
-    #     # db.session.add(new)
-    #     if x % 100000 == 0:
-    #         print("Loop ", x, "flushing...")
-    #         db.session.flush()
-    #         print("Flushed.")
-    db.session.commit()
 
-def insertR34xxxStartingPoints():
-    tmp = db.session.query(db.Releases) \
-        .filter(db.Releases.postid == 1) \
-        .filter(db.Releases.source == 'Rule34.xxx') \
-        .count()
-    if not tmp:
-        print("Building insert data structure")
-        dat = [{"dlstate" : 0, "postid" : x, "source" : 'Rule34.xxx'} for x in range(1844200)]
-        print("Building insert query")
-        q = db.Releases.__table__.insert().values(dat)
-        print("Built. Doing insert.")
-        db.engine.execute(q)
-        print("Done.")
-    # for x in :
-    #     new = db.Releases(dlstate=0, postid=x, source='Gelbooru')
-    #     # db.session.add(new)
-    #     if x % 100000 == 0:
-    #         print("Loop ", x, "flushing...")
-    #         db.session.flush()
-    #         print("Flushed.")
-    db.session.commit()
+        if not changes:
+            break
+    print("[%s] - Done." % target)
 
 def resetDlstate():
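The hunk above replaces three copy-pasted, all-or-nothing bulk inserts with one generic do_upsert() that walks down from maxitems in UPSERT_STEP-sized chunks and stops at the first chunk that inserts no new rows, relying on PostgreSQL's INSERT ... ON CONFLICT DO NOTHING. A self-contained sketch of the same pattern; the table definition and connection handling are illustrative stand-ins for the project's db.Releases, not its actual schema:

```python
import sqlalchemy as sa
from sqlalchemy.dialects.postgresql import insert

meta = sa.MetaData()
# Hypothetical stand-in for db.Releases; the conflict target is the
# uniqueness constraint on (postid, source).
releases = sa.Table(
    "releases", meta,
    sa.Column("id", sa.Integer, primary_key=True),
    sa.Column("dlstate", sa.Integer),
    sa.Column("postid", sa.Integer),
    sa.Column("source", sa.Text),
    sa.UniqueConstraint("postid", "source"),
)

UPSERT_STEP = 10000

def do_upsert(conn, target, maxitems):
    # Walk backwards from maxitems in UPSERT_STEP chunks. Rows that already
    # exist are silently skipped by ON CONFLICT DO NOTHING; once a whole
    # chunk changes nothing, every lower postid is assumed present already.
    for start in range(maxitems, 0, -UPSERT_STEP):
        dat = [{"dlstate": 0, "postid": p, "source": target}
               for p in range(start, start + UPSERT_STEP)]
        q = insert(releases).values(dat).on_conflict_do_nothing()
        if not conn.execute(q).rowcount:
            break
```

Called as, e.g., `with engine.begin() as conn: do_upsert(conn, "Danbooru", 2700000)`. The early exit is what makes re-runs cheap: after the first full population, only the newest chunk of post ids costs an actual insert.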
@@ -88,25 +46,29 @@ def resetDlstate():
 def go():
-    insertDanbooruStartingPoints()
-    insertGelbooruStartingPoints()
-    insertR34xxxStartingPoints()
+    print("Inserting start URLs")
+    do_upsert("Danbooru", 2700000)
+    do_upsert('Gelbooru', 3600000)
+    do_upsert('Rule34.xxx', 2300000)
 
+    print("Resetting DL states.")
     resetDlstate()
     # r34xxxScrape.run(0)
     print("Creating run contexts")
     executor = concurrent.futures.ThreadPoolExecutor(max_workers=THREADS)
     try:
         # for x in range(2):
         #     executor.submit(danbooruFetch.run, 0)
         #     executor.submit(gelbooruFetch.run, 0)
-        for x in range(THREADS):
+        for x in range(THREADS//3):
             executor.submit(r34xxxScrape.run, x)
-        # for x in range(THREADS//2):
-        #     executor.submit(danbooruFetch.run, x)
-        # for x in range(THREADS//2):
-        #     executor.submit(gelbooruFetch.run, x)
+        for x in range(THREADS//2):
+            executor.submit(danbooruFetch.run, x)
+        for x in range(THREADS//2):
+            executor.submit(gelbooruFetch.run, x)
         print("Waiting for workers to complete.")
         executor.shutdown()
     except KeyboardInterrupt:
         print("Waiting for executor.")

View File

@@ -59,6 +59,7 @@ def go():
         proc += 1
         if proc % 50 == 0:
             db.session.commit()
 
 if __name__ == '__main__':
     go()

View File

@@ -87,7 +87,7 @@ class R34xxxFetcher(danbooruFetch.DanbooruFetcher):
             cal = parsedatetime.Calendar()
             val = val.split("by")[0]
             tstruct, pstat = cal.parse(val)
-            assert pstat == 1 or pstat == 2
+            assert pstat == 1 or pstat == 2 or pstat == 3
             job.posted = datetime.datetime.fromtimestamp(time.mktime(tstruct))
         elif name == 'Size':
             if not '\n' in val:
@@ -141,7 +141,7 @@ class R34xxxFetcher(danbooruFetch.DanbooruFetcher):
         # print(fname)
 
     def processJob(self, job):
-        pageurl = 'http://rule34.xxx/index.php?page=post&s=view&id={}'.format(job.postid)
+        pageurl = 'https://rule34.xxx/index.php?page=post&s=view&id={}'.format(job.postid)
         try:
             soup = self.wg.getSoup(pageurl)
         except urllib.error.URLError:
@@ -174,6 +174,7 @@ class R34xxxFetcher(danbooruFetch.DanbooruFetcher):
                 break
             except AssertionError:
                 self.log.info("Assertion error?: '%s'", pageurl)
+                traceback.print_exc()
                 job.dlstate=-50
                 db.session.rollback()
                 break
@@ -210,3 +211,14 @@ def run(indice):
         print("Unhandled exception!")
         traceback.print_exc()
         raise
+
+if __name__ == '__main__':
+    import logSetup
+    logSetup.initLogging()
+    run(1)

View File

@@ -4,7 +4,7 @@
 DATABASE_USER = "dbarchiver"
 DATABASE_PASS = "YEkTYt4sCcWctY"
 DATABASE_DB_NAME = "dbmirror"
-DATABASE_IP = "10.1.1.8"
+DATABASE_IP = "10.1.1.61"
 
 # Note that a local socket will be tried before the DATABASE_IP value, so if DATABASE_IP is
 # invalid, it may work anyway.
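That comment describes connection behavior rather than anything changed in this diff, but the mechanism is easy to picture: a PostgreSQL client given no host connects over the local unix socket. A hedged sketch of one way such a fallback could be implemented, assuming psycopg2 (the project's actual connection code is not part of this commit):

```python
import psycopg2

def connect(user, password, dbname, ip):
    # Try the local unix socket first (no host argument); only fall back
    # to TCP at DATABASE_IP if the socket connection fails.
    try:
        return psycopg2.connect(user=user, password=password, dbname=dbname)
    except psycopg2.OperationalError:
        return psycopg2.connect(user=user, password=password,
                                dbname=dbname, host=ip)
```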

View File

@@ -201,7 +201,7 @@ class WebGetRobust:
         if isinstance(page, bytes):
             raise ValueError("Received content not decoded! Cannot parse!")
 
-        soup = bs4.BeautifulSoup(page)
+        soup = bs4.BeautifulSoup(page, "lxml")
         return soup
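Naming the parser explicitly pins bs4 to lxml instead of letting it pick whichever parser happens to be installed; newer bs4 versions warn when the parser is left unspecified, and different parsers can build different trees from malformed markup. A sketch, assuming the lxml package is available:

```python
import bs4

page = "<html><body><p>hello</p></body></html>"
soup = bs4.BeautifulSoup(page, "lxml")  # explicit parser: reproducible parses
print(soup.p.get_text())
```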