From 5f28db1d8b57baeabe9c3a59c3d599076e3f6ea2 Mon Sep 17 00:00:00 2001
From: Fake-Name
Date: Mon, 6 Feb 2017 22:01:57 -0800
Subject: [PATCH] Bringing it baaaaack!

---
 README.md        |   8 +++-
 danbooruFetch.py |  10 +++++
 gelbooruFetch.py |  16 +++++--
 main.py          | 106 +++++++++++++++--------------------------------
 patch.py         |   1 +
 r34xxxScrape.py  |  16 ++++++-
 settings.py      |   2 +-
 webFunctions.py  |   2 +-
 8 files changed, 80 insertions(+), 81 deletions(-)

diff --git a/README.md b/README.md
index 77645b1..fccada7 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,9 @@
-Minimalistic high-volume multi-threaded archive-tool for Danbooru & Gelbooru.
+Minimalistic high-volume multi-threaded archive-tool for imagegallery sites.
+
+Currently Supports:
+ - Danbooru
+ - Gelbooru
+ - http://rule34.xxx/
 
 Written because I needed a extremely large image database *with tags* to use
 for some experiments with training neural nets.
@@ -15,6 +20,5 @@ Potential ideas:
  - http://e-shuushuu.net/
  - https://konachan.com/
  - https://chan.sankakucomplex.com/
- - http://rule34.xxx/
  - http://paheal.net/
  - https://e621.net/ (furry?)
\ No newline at end of file
diff --git a/danbooruFetch.py b/danbooruFetch.py
index f68d728..75ccc74 100644
--- a/danbooruFetch.py
+++ b/danbooruFetch.py
@@ -196,3 +196,13 @@ def run(indice):
 		print("Unhandled exception!")
 		traceback.print_exc()
 		raise
+
+
+
+if __name__ == '__main__':
+
+	import logSetup
+	logSetup.initLogging()
+
+	run(1)
+
diff --git a/gelbooruFetch.py b/gelbooruFetch.py
index 4936f87..c6ce06e 100644
--- a/gelbooruFetch.py
+++ b/gelbooruFetch.py
@@ -102,10 +102,12 @@ class GelbooruFetcher(object):
 				job.score = val.split()[0]
 			elif name == 'Posted':
 				cal = parsedatetime.Calendar()
-				itemdate = val.split("at")[0]
-				itemdate = val.split("by")[0]
+				itemdate = val.split("at")[0]
+				itemdate = itemdate.split("by")[0]
+				print("itemdate", itemdate)
 				tstruct, pstat = cal.parse(itemdate)
-				assert pstat == 1 or pstat == 2
+				print("Ret: ", pstat, tstruct)
+				assert pstat == 1 or pstat == 2 or pstat == 3
 				job.posted = datetime.datetime.fromtimestamp(time.mktime(tstruct))
 			elif name == 'Size':
 				job.imgx, job.imgy = self.getxy(val)
@@ -277,3 +279,11 @@ def run(indice):
 		print("Unhandled exception!")
 		traceback.print_exc()
 		raise
+
+if __name__ == '__main__':
+
+	import logSetup
+	logSetup.initLogging()
+
+	run(1)
+
diff --git a/main.py b/main.py
index 3ad307b..df75333 100644
--- a/main.py
+++ b/main.py
@@ -1,6 +1,9 @@
 import sys
 import database as db
 
+
+from sqlalchemy.dialects.postgresql import insert
+
 import logSetup
 logSetup.initLogging()
 
@@ -13,71 +16,26 @@ import concurrent.futures
 # THREADS = 6
 THREADS = 30
 
-def insertDanbooruStartingPoints():
+UPSERT_STEP = 10000
 
-	tmp = db.session.query(db.Releases) \
-		.filter(db.Releases.postid == 1) \
-		.filter(db.Releases.source == 'Danbooru')
+def do_upsert(target, maxitems):
+	for x in range(maxitems, 0, UPSERT_STEP * -1):
 
-	tmp = tmp.count()
-	if not tmp:
-		for x in range(2070000):
-			new = db.Releases(dlstate=0, postid=x, source='Danbooru')
-			db.session.add(new)
-			if x % 10000 == 0:
-				print("Loop ", x, "flushing...")
-				db.session.flush()
-				print("Flushed.")
-	db.session.commit()
+		print("[%s] - Building insert data structure %s -> %s" % (target, x, x+UPSERT_STEP))
+		dat = [{"dlstate" : 0, "postid" : x, "source" : target} for x in range(x, x+UPSERT_STEP)]
+		print("[%s] - Building insert query" % target)
+		q = insert(db.Releases).values(dat)
+		q = q.on_conflict_do_nothing()
+		print("[%s] - Built. Doing insert." % target)
+		ret = db.session.execute(q)
 
-def insertGelbooruStartingPoints():
+		changes = ret.rowcount
+		print("[%s] - Changed rows: %s" % (target, changes))
+		db.session.commit()
 
-	tmp = db.session.query(db.Releases) \
-		.filter(db.Releases.postid == 1) \
-		.filter(db.Releases.source == 'Gelbooru') \
-		.count()
-	if not tmp:
-
-		print("Building insert data structure")
-		dat = [{"dlstate" : 0, "postid" : x, "source" : 'Gelbooru'} for x in range(2900000)]
-		print("Building insert query")
-		q = db.Releases.__table__.insert().values(dat)
-		print("Built. Doing insert.")
-		db.engine.execute(q)
-		print("Done.")
-		# for x in :
-
-		# 	new = db.Releases(dlstate=0, postid=x, source='Gelbooru')
-		# 	# db.session.add(new)
-		# 	if x % 100000 == 0:
-		# 		print("Loop ", x, "flushing...")
-		# 		db.session.flush()
-		# 		print("Flushed.")
-	db.session.commit()
-
-def insertR34xxxStartingPoints():
-
-	tmp = db.session.query(db.Releases) \
-		.filter(db.Releases.postid == 1) \
-		.filter(db.Releases.source == 'Rule34.xxx') \
-		.count()
-	if not tmp:
-
-		print("Building insert data structure")
-		dat = [{"dlstate" : 0, "postid" : x, "source" : 'Rule34.xxx'} for x in range(1844200)]
-		print("Building insert query")
-		q = db.Releases.__table__.insert().values(dat)
-		print("Built. Doing insert.")
-		db.engine.execute(q)
-		print("Done.")
-		# for x in :
-
-		# 	new = db.Releases(dlstate=0, postid=x, source='Gelbooru')
-		# 	# db.session.add(new)
-		# 	if x % 100000 == 0:
-		# 		print("Loop ", x, "flushing...")
-		# 		db.session.flush()
-		# 		print("Flushed.")
-	db.session.commit()
+		if not changes:
+			break
+	print("[%s] - Done." % target)
 
 
 def resetDlstate():
@@ -88,25 +46,29 @@ def go():
 
-	insertDanbooruStartingPoints()
-	insertGelbooruStartingPoints()
-	insertR34xxxStartingPoints()
+	print("Inserting start URLs")
+
+	do_upsert("Danbooru", 2700000)
+	do_upsert('Gelbooru', 3600000)
+	do_upsert('Rule34.xxx', 2300000)
+
+	print("Resetting DL states.")
 	resetDlstate()
-
-	# r34xxxScrape.run(0)
-
+
+	print("Creating run contexts")
 	executor = concurrent.futures.ThreadPoolExecutor(max_workers=THREADS)
 	try:
 		# for x in range(2):
 		# 	executor.submit(danbooruFetch.run, 0)
 		# 	executor.submit(gelbooruFetch.run, 0)
-		for x in range(THREADS):
+		for x in range(THREADS//3):
 			executor.submit(r34xxxScrape.run, x)
-		# for x in range(THREADS//2):
-		# 	executor.submit(danbooruFetch.run, x)
-		# for x in range(THREADS//2):
-		# 	executor.submit(gelbooruFetch.run, x)
+		for x in range(THREADS//2):
+			executor.submit(danbooruFetch.run, x)
+		for x in range(THREADS//2):
+			executor.submit(gelbooruFetch.run, x)
+
+		print("Waiting for workers to complete.")
 		executor.shutdown()
 	except KeyboardInterrupt:
 		print("Waiting for executor.")
diff --git a/patch.py b/patch.py
index 9be98c3..90c52bf 100644
--- a/patch.py
+++ b/patch.py
@@ -59,6 +59,7 @@ def go():
 		proc += 1
 		if proc % 50 == 0:
 			db.session.commit()
+
 
 if __name__ == '__main__':
 	go()
diff --git a/r34xxxScrape.py b/r34xxxScrape.py
index 1c6971d..86aa023 100644
--- a/r34xxxScrape.py
+++ b/r34xxxScrape.py
@@ -87,7 +87,7 @@ class R34xxxFetcher(danbooruFetch.DanbooruFetcher):
 				cal = parsedatetime.Calendar()
 				val = val.split("by")[0]
 				tstruct, pstat = cal.parse(val)
-				assert pstat == 1 or pstat == 2
+				assert pstat == 1 or pstat == 2 or pstat == 3
 				job.posted = datetime.datetime.fromtimestamp(time.mktime(tstruct))
 			elif name == 'Size':
 				if not '\n' in val:
@@ -141,7 +141,7 @@ class R34xxxFetcher(danbooruFetch.DanbooruFetcher):
 		# print(fname)
 
 	def processJob(self, job):
-		pageurl = 'http://rule34.xxx/index.php?page=post&s=view&id={}'.format(job.postid)
+		pageurl = 'https://rule34.xxx/index.php?page=post&s=view&id={}'.format(job.postid)
 		try:
 			soup = self.wg.getSoup(pageurl)
 		except urllib.error.URLError:
@@ -174,6 +174,7 @@ class R34xxxFetcher(danbooruFetch.DanbooruFetcher):
 				break
 			except AssertionError:
 				self.log.info("Assertion error?: '%s'", pageurl)
+				traceback.print_exc()
 				job.dlstate=-50
 				db.session.rollback()
 				break
@@ -210,3 +211,14 @@ def run(indice):
 		print("Unhandled exception!")
 		traceback.print_exc()
 		raise
+
+
+
+if __name__ == '__main__':
+
+	import logSetup
+	logSetup.initLogging()
+
+	run(1)
+
+
diff --git a/settings.py b/settings.py
index c55d8b5..75de43c 100644
--- a/settings.py
+++ b/settings.py
@@ -4,7 +4,7 @@ DATABASE_USER = "dbarchiver"
 DATABASE_PASS = "YEkTYt4sCcWctY"
 
 DATABASE_DB_NAME = "dbmirror"
-DATABASE_IP = "10.1.1.8"
+DATABASE_IP = "10.1.1.61"
 
 # Note that a local socket will be tried before the DATABASE_IP value, so if DATABASE_IP is
 # invalid, it may work anyways.
diff --git a/webFunctions.py b/webFunctions.py
index b8de4bb..e471a2d 100644
--- a/webFunctions.py
+++ b/webFunctions.py
@@ -201,7 +201,7 @@ class WebGetRobust:
 
 		if isinstance(page, bytes):
 			raise ValueError("Received content not decoded! Cannot parse!")
 
-		soup = bs4.BeautifulSoup(page)
+		soup = bs4.BeautifulSoup(page, "lxml")
 		return soup
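The do_upsert() helper added to main.py above replaces the three per-site seeding functions with batched PostgreSQL "INSERT ... ON CONFLICT DO NOTHING" statements, walking the ID space from the newest block downward and stopping once a whole block already exists. Below is a minimal, self-contained sketch of that pattern; the table definition, column names, and seed_ids() helper are illustrative stand-ins for the project's db.Releases model, not its actual schema.

# Sketch of the batched ID-seeding upsert used by do_upsert() in main.py.
# Assumes PostgreSQL and SQLAlchemy >= 1.1 (for on_conflict_do_nothing()).
import sqlalchemy as sa
from sqlalchemy.dialects.postgresql import insert

metadata = sa.MetaData()

# Hypothetical stand-in for the db.Releases table.
releases = sa.Table(
	"releases", metadata,
	sa.Column("id",      sa.Integer, primary_key=True),
	sa.Column("dlstate", sa.Integer, nullable=False),
	sa.Column("postid",  sa.Integer, nullable=False),
	sa.Column("source",  sa.Text,    nullable=False),
	sa.UniqueConstraint("postid", "source"),
)

UPSERT_STEP = 10000

def seed_ids(engine, source, maxitems):
	# Walk the ID space in blocks, newest block first. Existing rows are
	# skipped by ON CONFLICT DO NOTHING, so rowcount only counts rows that
	# were actually inserted; once an entire block inserts nothing, the
	# older blocks are assumed to be seeded already and the loop stops.
	for start in range(maxitems, 0, -UPSERT_STEP):
		rows = [
			{"dlstate": 0, "postid": postid, "source": source}
			for postid in range(start, start + UPSERT_STEP)
		]
		stmt = insert(releases).values(rows).on_conflict_do_nothing()
		with engine.begin() as conn:
			changed = conn.execute(stmt).rowcount
		if not changed:
			break

Because rowcount only reflects newly inserted rows, re-running the seeding after new posts appear upstream extends the table without re-touching ranges that are already present.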
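The relaxed assertions in gelbooruFetch.py and r34xxxScrape.py (pstat == 3 is now accepted) line up with parsedatetime's status flags: Calendar.parse() returns 0 when nothing could be parsed, 1 for a date, 2 for a time, and 3 when both a date and a time were recognised. A short sketch of the same conversion the fetchers perform, using a made-up "Posted" string rather than real site markup:

import time
import datetime
import parsedatetime

cal = parsedatetime.Calendar()
# parse() returns (struct_time, status); status 0 means the string was not
# understood at all, so only 1, 2 and 3 are acceptable here.
tstruct, pstat = cal.parse("2017-02-06 22:01")
assert pstat in (1, 2, 3)
posted = datetime.datetime.fromtimestamp(time.mktime(tstruct))
print(posted)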
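The webFunctions.py change pins BeautifulSoup to the lxml parser instead of letting it guess from whatever is installed, which keeps parsing behaviour consistent across machines and avoids the "no parser was explicitly specified" warning that newer bs4 releases emit. A trivial example of the pinned form (requires the lxml package):

import bs4

html = "<html><body><p>example</p></body></html>"
# Explicit parser choice: same parse tree everywhere lxml is available.
soup = bs4.BeautifulSoup(html, "lxml")
print(soup.p.get_text())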