Bringing it baaaaack!
This commit is contained in:
parent
1aa23060c6
commit
5f28db1d8b
|
@ -1,4 +1,9 @@
|
|||
Minimalistic high-volume multi-threaded archive-tool for Danbooru & Gelbooru.
|
||||
Minimalistic high-volume multi-threaded archive-tool for image-gallery sites.
|
||||
|
||||
Currently Supports:
|
||||
- Danbooru
|
||||
- Gelbooru
|
||||
- http://rule34.xxx/
|
||||
|
||||
Written because I needed an extremely large image database *with tags* to
|
||||
use for some experiments with training neural nets.
|
||||
|
@ -15,6 +20,5 @@ Potential ideas:
|
|||
- http://e-shuushuu.net/
|
||||
- https://konachan.com/
|
||||
- https://chan.sankakucomplex.com/
|
||||
- http://rule34.xxx/
|
||||
- http://paheal.net/
|
||||
- https://e621.net/ (furry?)
|
|
@ -196,3 +196,13 @@ def run(indice):
|
|||
print("Unhandled exception!")
|
||||
traceback.print_exc()
|
||||
raise
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
import logSetup
|
||||
logSetup.initLogging()
|
||||
|
||||
run(1)
|
||||
|
||||
|
|
|
@ -102,10 +102,12 @@ class GelbooruFetcher(object):
|
|||
job.score = val.split()[0]
|
||||
elif name == 'Posted':
|
||||
cal = parsedatetime.Calendar()
|
||||
itemdate = val.split("at")[0]
|
||||
itemdate = val.split("by")[0]
|
||||
itemdate = val.split("at")[0]
|
||||
itemdate = itemdate.split("by")[0]
|
||||
print("itemdate", itemdate)
|
||||
tstruct, pstat = cal.parse(itemdate)
|
||||
assert pstat == 1 or pstat == 2
|
||||
print("Ret: ", pstat, tstruct)
|
||||
assert pstat == 1 or pstat == 2 or pstat == 3
|
||||
job.posted = datetime.datetime.fromtimestamp(time.mktime(tstruct))
|
||||
elif name == 'Size':
|
||||
job.imgx, job.imgy = self.getxy(val)
|
||||
|
@ -277,3 +279,11 @@ def run(indice):
|
|||
print("Unhandled exception!")
|
||||
traceback.print_exc()
|
||||
raise
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
import logSetup
|
||||
logSetup.initLogging()
|
||||
|
||||
run(1)
|
||||
|
||||
|
|
106
main.py
106
main.py
|
@ -1,6 +1,9 @@
|
|||
|
||||
import sys
|
||||
import database as db
|
||||
|
||||
from sqlalchemy.dialects.postgresql import insert
|
||||
|
||||
import logSetup
|
||||
logSetup.initLogging()
|
||||
|
||||
|
@ -13,71 +16,26 @@ import concurrent.futures
|
|||
# THREADS = 6
|
||||
THREADS = 30
|
||||
|
||||
def insertDanbooruStartingPoints():
|
||||
UPSERT_STEP = 10000
|
||||
|
||||
tmp = db.session.query(db.Releases) \
|
||||
.filter(db.Releases.postid == 1) \
|
||||
.filter(db.Releases.source == 'Danbooru')
|
||||
def do_upsert(target, maxitems):
|
||||
for x in range(maxitems, 0, UPSERT_STEP * -1):
|
||||
|
||||
tmp = tmp.count()
|
||||
if not tmp:
|
||||
for x in range(2070000):
|
||||
new = db.Releases(dlstate=0, postid=x, source='Danbooru')
|
||||
db.session.add(new)
|
||||
if x % 10000 == 0:
|
||||
print("Loop ", x, "flushing...")
|
||||
db.session.flush()
|
||||
print("Flushed.")
|
||||
db.session.commit()
|
||||
print("[%s] - Building insert data structure %s -> %s" % (target, x, x+UPSERT_STEP))
|
||||
dat = [{"dlstate" : 0, "postid" : x, "source" : target} for x in range(x, x+UPSERT_STEP)]
|
||||
print("[%s] - Building insert query" % target)
|
||||
q = insert(db.Releases).values(dat)
|
||||
q = q.on_conflict_do_nothing()
|
||||
print("[%s] - Built. Doing insert." % target)
|
||||
ret = db.session.execute(q)
|
||||
|
||||
def insertGelbooruStartingPoints():
|
||||
changes = ret.rowcount
|
||||
print("[%s] - Changed rows: %s" % (target, changes))
|
||||
db.session.commit()
|
||||
|
||||
tmp = db.session.query(db.Releases) \
|
||||
.filter(db.Releases.postid == 1) \
|
||||
.filter(db.Releases.source == 'Gelbooru') \
|
||||
.count()
|
||||
if not tmp:
|
||||
|
||||
print("Building insert data structure")
|
||||
dat = [{"dlstate" : 0, "postid" : x, "source" : 'Gelbooru'} for x in range(2900000)]
|
||||
print("Building insert query")
|
||||
q = db.Releases.__table__.insert().values(dat)
|
||||
print("Built. Doing insert.")
|
||||
db.engine.execute(q)
|
||||
print("Done.")
|
||||
# for x in :
|
||||
|
||||
# new = db.Releases(dlstate=0, postid=x, source='Gelbooru')
|
||||
# # db.session.add(new)
|
||||
# if x % 100000 == 0:
|
||||
# print("Loop ", x, "flushing...")
|
||||
# db.session.flush()
|
||||
# print("Flushed.")
|
||||
db.session.commit()
|
||||
def insertR34xxxStartingPoints():
|
||||
|
||||
tmp = db.session.query(db.Releases) \
|
||||
.filter(db.Releases.postid == 1) \
|
||||
.filter(db.Releases.source == 'Rule34.xxx') \
|
||||
.count()
|
||||
if not tmp:
|
||||
|
||||
print("Building insert data structure")
|
||||
dat = [{"dlstate" : 0, "postid" : x, "source" : 'Rule34.xxx'} for x in range(1844200)]
|
||||
print("Building insert query")
|
||||
q = db.Releases.__table__.insert().values(dat)
|
||||
print("Built. Doing insert.")
|
||||
db.engine.execute(q)
|
||||
print("Done.")
|
||||
# for x in :
|
||||
|
||||
# new = db.Releases(dlstate=0, postid=x, source='Gelbooru')
|
||||
# # db.session.add(new)
|
||||
# if x % 100000 == 0:
|
||||
# print("Loop ", x, "flushing...")
|
||||
# db.session.flush()
|
||||
# print("Flushed.")
|
||||
db.session.commit()
|
||||
if not changes:
|
||||
break
|
||||
print("[%s] - Done." % target)
|
||||
|
||||
|
||||
def resetDlstate():
|
||||
|
@ -88,25 +46,29 @@ def resetDlstate():
|
|||
|
||||
|
||||
def go():
|
||||
insertDanbooruStartingPoints()
|
||||
insertGelbooruStartingPoints()
|
||||
insertR34xxxStartingPoints()
|
||||
print("Inserting start URLs")
|
||||
|
||||
do_upsert("Danbooru", 2700000)
|
||||
do_upsert('Gelbooru', 3600000)
|
||||
do_upsert('Rule34.xxx', 2300000)
|
||||
|
||||
print("Resetting DL states.")
|
||||
resetDlstate()
|
||||
|
||||
|
||||
# r34xxxScrape.run(0)
|
||||
|
||||
print("Creating run contexts")
|
||||
executor = concurrent.futures.ThreadPoolExecutor(max_workers=THREADS)
|
||||
try:
|
||||
# for x in range(2):
|
||||
# executor.submit(danbooruFetch.run, 0)
|
||||
# executor.submit(gelbooruFetch.run, 0)
|
||||
for x in range(THREADS):
|
||||
for x in range(THREADS//3):
|
||||
executor.submit(r34xxxScrape.run, x)
|
||||
# for x in range(THREADS//2):
|
||||
# executor.submit(danbooruFetch.run, x)
|
||||
# for x in range(THREADS//2):
|
||||
# executor.submit(gelbooruFetch.run, x)
|
||||
for x in range(THREADS//2):
|
||||
executor.submit(danbooruFetch.run, x)
|
||||
for x in range(THREADS//2):
|
||||
executor.submit(gelbooruFetch.run, x)
|
||||
|
||||
print("Waiting for workers to complete.")
|
||||
executor.shutdown()
|
||||
except KeyboardInterrupt:
|
||||
print("Waiting for executor.")
|
||||
|
|
1
patch.py
1
patch.py
|
@ -59,6 +59,7 @@ def go():
|
|||
proc += 1
|
||||
if proc % 50 == 0:
|
||||
db.session.commit()
|
||||
|
||||
if __name__ == '__main__':
|
||||
go()
|
||||
|
||||
|
|
|
@ -87,7 +87,7 @@ class R34xxxFetcher(danbooruFetch.DanbooruFetcher):
|
|||
cal = parsedatetime.Calendar()
|
||||
val = val.split("by")[0]
|
||||
tstruct, pstat = cal.parse(val)
|
||||
assert pstat == 1 or pstat == 2
|
||||
assert pstat == 1 or pstat == 2 or pstat == 3
|
||||
job.posted = datetime.datetime.fromtimestamp(time.mktime(tstruct))
|
||||
elif name == 'Size':
|
||||
if not '\n' in val:
|
||||
|
@ -141,7 +141,7 @@ class R34xxxFetcher(danbooruFetch.DanbooruFetcher):
|
|||
# print(fname)
|
||||
|
||||
def processJob(self, job):
|
||||
pageurl = 'http://rule34.xxx/index.php?page=post&s=view&id={}'.format(job.postid)
|
||||
pageurl = 'https://rule34.xxx/index.php?page=post&s=view&id={}'.format(job.postid)
|
||||
try:
|
||||
soup = self.wg.getSoup(pageurl)
|
||||
except urllib.error.URLError:
|
||||
|
@ -174,6 +174,7 @@ class R34xxxFetcher(danbooruFetch.DanbooruFetcher):
|
|||
break
|
||||
except AssertionError:
|
||||
self.log.info("Assertion error?: '%s'", pageurl)
|
||||
traceback.print_exc()
|
||||
job.dlstate=-50
|
||||
db.session.rollback()
|
||||
break
|
||||
|
@ -210,3 +211,14 @@ def run(indice):
|
|||
print("Unhandled exception!")
|
||||
traceback.print_exc()
|
||||
raise
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
import logSetup
|
||||
logSetup.initLogging()
|
||||
|
||||
run(1)
|
||||
|
||||
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
DATABASE_USER = "dbarchiver"
|
||||
DATABASE_PASS = "YEkTYt4sCcWctY"
|
||||
DATABASE_DB_NAME = "dbmirror"
|
||||
DATABASE_IP = "10.1.1.8"
|
||||
DATABASE_IP = "10.1.1.61"
|
||||
# Note that a local socket will be tried before the DATABASE_IP value, so if DATABASE_IP is
|
||||
# invalid, the connection may work anyway.
|
||||
|
||||
|
|
|
@ -201,7 +201,7 @@ class WebGetRobust:
|
|||
if isinstance(page, bytes):
|
||||
raise ValueError("Received content not decoded! Cannot parse!")
|
||||
|
||||
soup = bs4.BeautifulSoup(page)
|
||||
soup = bs4.BeautifulSoup(page, "lxml")
|
||||
return soup
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue