misc-scripts/discogarchive

#!/bin/sh
# -e: stop on an unhandled command failure
# -u: treat expansion of an unset variable as an error
# -f: disable pathname expansion (the download step below re-enables it
#     briefly with `set +f` to test for *.flac files)
set -euf

# TODO: maybe make it so we're in an "updating" or "not" state?
# would have to keep tabs with an existing description file and go from there
# for now we don't gotta worry about it too much

# name this script was invoked as (path stripped); fail() prefixes errors with it
BN="${0##*/}"
# a single literal newline character
NL='
'
# source the python venv's activate script into this shell, with OSTYPE
# supplied on the same command line
# NOTE(review): presumably activate reads $OSTYPE, which plain sh may leave
# unset and which would then be fatal under `set -u` -- confirm
OSTYPE=linux-gnu . "$HOME/.local/python-venv/bin/activate"

# HELPER FUNCTIONS #
# print all arguments, joined into one line, on stderr instead of stdout
errecho() {
	echo "$*" 1>&2
}

# report a fatal error on stderr as "error: <script name>: <message>"
# and terminate the script with exit status 1
# $*: message
fail() {
	errecho "error: ${BN}: $*"
	exit 1
}

# filter (stdin -> stdout): turn the escaped numeric html entities that
# bandcamp puts in its attribute data back into the plain characters
# " ' < > &, so the result can be put through jq
# &#38; is handled last so that "&#38;#34;" decodes once, to "&#34;"
unescapehtml() {
	sed \
		-e 's/&#34;/"/g' \
		-e "s/&#39;/'/g" \
		-e 's/&#60;/</g' \
		-e 's/&#62;/>/g' \
		-e 's/&#38;/\&/g'
}
# for release escaping
# filter (stdin -> stdout): replace & < > " ' with html character references
# & is escaped first so the ampersands introduced by the later
# substitutions are not escaped a second time
escapehtml() {
	sed \
		-e 's/&/\&amp;/g' \
		-e 's/</\&lt;/g' \
		-e 's/>/\&gt;/g' \
		-e 's/"/\&quot;/g' \
		-e 's/'\''/\&#39;/g' # was "&#39" without the terminating ';'
}

# inserts a line into a section of DESCRIPTION.html (edited in place with ed)
# $1: section title, matched right after "<br><div><b>" on the header line
# $2: index: number of lines below the section header to step to, or "end"
#     to step to the next blank line after the header; the string is
#     inserted BEFORE the line reached (ed's `i` command)
# $3: string to insert
# sets the globals $index and $index_str as a side effect
# "end" assumes a blank line follows the section
insert_into_section() {
	index="$2"
	case "$index" in
		end) index_str='/^$' ;;
		*) index_str="+$index" ;;
	esac
	# the header search uses $1; the previous "$section" was never assigned,
	# which is fatal under `set -u` and otherwise matches any section header
	>/dev/null ed DESCRIPTION.html <<-EOF
		/<br><div><b>$1
		$index_str
		i
		$3
		.
		wq
	EOF
}

# prints every blank-line-separated paragraph of DESCRIPTION.html whose
# first line matches $1 (used as an awk regex), header line included
# $1: section title / pattern
get_section() {
	awk -v RS="" -v FS='\n' -v h="$1" '$1 ~ h { print }' DESCRIPTION.html
}

# succeeds if a link with href="$1" is listed in the "Contained Releases"
# section, printing "num:line" for each hit, where num counts lines within
# the section (header = 1); fails with no output otherwise
# $1: release url
is_contained() {
	get_section 'Contained Releases' | grep -n -F -e "href=\"${1}\""
}

# same check as is_contained, but against the "Uncontained Releases" section
# $1: release url
is_uncontained() {
	get_section 'Uncontained Releases' | grep -n -F -e "href=\"${1}\""
}

# moves one line of DESCRIPTION.html (a link sitting in the uncontained
# section) so that it follows the last contained release, or the
# "Contained Releases" header when there is no such release yet
# reads globals:
#   $unline         "num:line" of the link to move, num = file line number
#   $last_contained url of the most recently seen contained release
# $1 = url (not used by the body)
# sets the global $conline as a side effect
move_from_uncontained() {
	if conline=$(is_contained "$last_contained"); then
		# is_contained numbers lines within the section; look the line text
		# up again to get its line number within the whole file
		conline=$(grep -n -F "${conline#*:}" DESCRIPTION.html)
	else
		conline='/<br><div><b>Contained Releases'
	fi
	# ed: "<source line>m<destination address>", then write and quit
	printf '%s\n' "${unline%%:*}m${conline%%:*}" 'wq' | ed DESCRIPTION.html >/dev/null
}

# exit handler: removes whichever of the four scratch files exist, then
# exits; the traps are first re-pointed at a bare `exit` so that a second
# signal (or the exit below) does not run the cleanup again
clean() {
	trap 'exit' INT HUP QUIT TERM EXIT
	for tmpfile in \
		"${DISCOG_PAGE:-}" "${RELEASE_PAGE:-}" \
		"${JSON_HEAD:-}" "${JSON_TRALBUM:-}"
	do
		[ -f "$tmpfile" ] && rm -f "$tmpfile"
	done
	exit
}
# run the cleanup on normal exit as well as on INT/HUP/QUIT/TERM
trap 'clean' INT HUP QUIT TERM EXIT

# OPTIONS #
#   -n  don't download files, just make the description.html
#   -d  only download files, don't make description.html
#   -s  skip known albums, will insert uncontained -> contained ones after
#       "last contained album"
# every flag starts out empty and becomes 1 when its option is given
no_download=""
only_download=""
skip_contained=""
force_desc=""
while getopts :nds OPT; do
	case "$OPT" in
		n) no_download=1 ;;
		d) only_download=1 ;;
		s) skip_contained=1 ;;
		*) fail "unknown option: -$OPTARG" ;;
	esac
done
# drop the parsed options so only the positional arguments remain
shift "$((OPTIND - 1))"
# END OPTIONS #

# SETUP #
# creates a NEWLINE-delimited list of all bandcamp domains in question:
# the arguments are joined, cut apart at every "/" and " ", and only the
# pieces matching bandcamp.com are kept
BANDCAMP_DOMAINS=$(printf '%s\n' "$*" | awk -v RS='/| ' '$1 ~ "bandcamp.com" { print }')
# scratch files, reused for every domain/release; clean() removes them
DISCOG_PAGE=$(mktemp)
RELEASE_PAGE=$(mktemp)
JSON_HEAD=$(mktemp)
JSON_TRALBUM=$(mktemp)
## END SETUP

# MAIN LOOP #
# for every domain given on the command line:
#   1. enter ~/data/discographies/<domain> (created if missing)
#   2. unless -d: create DESCRIPTION.html (asking for the artist's socials on
#      stdin) when the discography is new or the file is missing
#   3. unless -d: fetch <domain>/music and file every release link under
#      "Contained Releases" (price 0.0) or "Uncontained Releases" (anything else)
#   4. unless -n: fetch the artist image and download the discography with
#      bcdl-free, moving loose .flac files into TRACKS/
# $BANDCAMP_DOMAINS is left unquoted on purpose so it splits into one word
# per domain (globbing is off via `set -f`)
for BANDCAMP_DOMAIN in $BANDCAMP_DOMAINS; do
	cd ~/data/discographies # normalized path ig
	is_existing_discog=""
	if [ -d "$BANDCAMP_DOMAIN" ]; then
		is_existing_discog=1
	fi
	mkdir -p "$BANDCAMP_DOMAIN"; cd "$BANDCAMP_DOMAIN"

	BASEURL="https://$BANDCAMP_DOMAIN"

	# SOCIALS #
	# grouped so that -d always wins: `a && b || c` parses as `(a && b) || c`,
	# which used to create DESCRIPTION.html in only-download mode whenever the
	# file was missing
	if [ -z "$only_download" ] && { [ -z "$is_existing_discog" ] || ! [ -f DESCRIPTION.html ]; }; then
		>&2 cat <<-EOF
		input artist socials in KEY=VALUE format. blank line to continue. bandcamp is already filled
		common shorthands:
		web=website, sc=soundcloud, yt=youtube, tw=twitch, tr=twitter, mx=mixcloud,
		ig=instagram, sp=spotify, lt=Linktree, ch=Cohost
		EOF
		# creates the DESCRIPTION thingy
		printf '<a href="%s" rel="nofollow">Bandcamp</a><br>\n' "$BASEURL" > DESCRIPTION.html
		# one KEY=VALUE pair per line from stdin; shorthands are expanded, any
		# other key is used as the link text verbatim, an empty key ends input
		while IFS="=" read -r key val; do
			case "$key" in
				web) key=Website ;;
				sc) key=SoundCloud ;;
				yt) key=YouTube ;;
				tw) key=Twitch ;;
				tr) key=Twitter ;;
				mx) key=Mixcloud ;;
				ig) key=Instagram ;;
				sp) key=Spotify ;;
				lt) key=Linktree ;;
				ch) key=Cohost ;;
				"") break ;;
				*) : ;;
			esac
			printf '<a href="%s" rel="nofollow">%s</a><br>\n' "$val" "$key" >> DESCRIPTION.html
		done
		errecho 'Moving on!'
		# the two (still empty) release sections; blank lines separate sections,
		# which is what get_section relies on
		printf '\n<br><div><b>%s</b></div>\n\n<br><div><b>%s</b></div>' \
			"Uncontained Releases" "Contained Releases" >> DESCRIPTION.html
	fi
	# END SOCIALS #

	# HTML/JSON PARSING #
	curl -L -s -o "$DISCOG_PAGE" "$BASEURL/music"

	if [ -z "$only_download" ]; then
		look_closer="" uncontained_releases="" contained_releases="" contained_releases_raw=""
		# urls of the release most recently filed in each section; new links
		# are appended right after them so page order is kept
		last_contained="" last_uncontained=""
		# albums and tracks
		while read -r url; do
			# the url list is a single blank line when the page has no
			# releases; without this, `curl ""` fails and set -e aborts the run
			[ -n "$url" ] || continue

			# relative links get the base url; absolute ones lose their query string
			case "$url" in
				/*) url="$BASEURL$url" ;;
				*) url=${url%%\?*} ;;
			esac

			# if is contained, skip it but also add its thingy to the contained releases var
			if [ -n "$skip_contained" ] && is_contained "$url" >/dev/null; then
				last_contained="$url"
				errecho "ALREADY CONTAINED: $url"
				continue
			fi

			# get album data
			curl -L -s -o "$RELEASE_PAGE" "$url"
			pup 'head > script[type="application/ld+json"]' 'text{}' < "$RELEASE_PAGE" > "$JSON_HEAD" || continue
			pup -p '[data-tralbum] attr{data-tralbum}' < "$RELEASE_PAGE" > "$JSON_TRALBUM" || continue

			artist="$(jq -r '.byArtist.name' < "$JSON_HEAD")"
			name="$(jq -r '.name' < "$JSON_HEAD")"
			# .inAlbum is present for single tracks; fall back to the album-level fields
			numtracks="$(jq -r '.inAlbum.numTracks' < "$JSON_HEAD")"
			[ "$numtracks" = "null" ] && numtracks="$(jq -r .numTracks < "$JSON_HEAD")"
			# NOTE(review): price is jq's raw rendering of the JSON number and is
			# compared as a string below, so a free release must render exactly
			# as 0.0 -- confirm this holds for the jq version in use
			price="$(jq '
				if (.inAlbum) then .inAlbum.albumRelease[0].offers.price
				else .albumRelease[0].offers.price end' < "$JSON_HEAD"
				)"

			# check if it even has audio first
			if [ "$(jq .hasAudio < "$JSON_TRALBUM")" != "true" ]; then
				errecho "NO AUDIO, LOOK CLOSER: $url"
				continue
			fi

			# now check price :D
			# NOTE(review): $artist/$name go into the html (and the ed script)
			# unescaped -- escapehtml exists but is not applied here
			if [ "$price" = "0.0" ]; then
				if unline=$(is_uncontained "$url"); then
					# section-relative hit -> file-relative "num:line"
					unline="$(grep -nF "${unline#*:}" DESCRIPTION.html)"
					move_from_uncontained "$url"
				elif ! is_contained "$url" >/dev/null; then
					# only insert when not already listed; a rebuild without -s
					# used to add a duplicate link here
					if indexline=$(is_contained "$last_contained"); then
						indexline="$(grep -nF "${indexline#*:}" DESCRIPTION.html)"
					else
						indexline="/<br><div><b>Contained Releases"
					fi
					# ed: address the anchor line, append the link after it
					>/dev/null ed DESCRIPTION.html <<-EOF
						${indexline%%:*}
						a
						<a href="$url" rel="nofollow">$artist - $name</a><br>
						.
						wq
					EOF
				fi
				errecho "CONTAINED RELEASE: $artist - $name"
				last_contained="$url"
			else
				# just to catch if we are rebuilding something and don't skip
				is_contained "$url" >/dev/null && continue
				errecho "UNCONTAINED RELEASE: $artist - $name"
				is_uncontained "$url" >/dev/null && continue
				if indexline=$(is_uncontained "$last_uncontained"); then
					indexline="$(grep -nF "${indexline#*:}" DESCRIPTION.html)"
				else
					indexline="/<br><div><b>Uncontained Releases"
				fi
				>/dev/null ed DESCRIPTION.html <<-EOF
					${indexline%%:*}
					a
					<a href="$url" rel="nofollow">$artist - $name</a><br>
					.
					wq
				EOF
				last_uncontained="$url"
			fi
		done <<-EOF
			$(
				pup '#music-grid > li > a attr{href}' < "$DISCOG_PAGE"
				pup '#music-grid attr{data-client-items}' < "$DISCOG_PAGE" | unescapehtml | jq -r '.[].page_url'
			)
		EOF
	fi

	# download shit
	if [ -z "$no_download" ]; then
		# updates of an existing discography land in their own subdirectory
		outdir="./files"
		if [ -n "$is_existing_discog" ]; then
			outdir="./files/new_stuff"
		fi
		mkdir -p "$outdir"
		printf '%s' 'CAT.ALL' > ./files/_rules.conf
		# artist img
		img="$(pup 'img.band-photo' 'attr{src}' < "$DISCOG_PAGE")"
		if [ -n "$img" ]; then
			ext="${img##*.}"
			# swap the suffix after the last "_" for "_0" before downloading
			curl -s -o ./files/artist."$ext" "${img%_*}_0.$ext"
		fi
		errecho "Done with retrieving metadata! Now to download..."
		bcdl-free --no-unzip -z 12345 -f FLAC -d "$outdir" -e auto -l "$BASEURL/music"

		# globbing is needed only here: an unmatched pattern expands to itself,
		# so a differing expansion means loose .flac files exist
		set +f
		if [ "$(printf '%s' "$outdir"/*.flac)" != "$outdir/*.flac" ]; then
			# -p: TRACKS may already exist from an earlier run; a plain mkdir
			# would fail and abort the script under set -e
			mkdir -p "$outdir"/TRACKS
			mv "$outdir"/*.flac "$outdir"/TRACKS
		fi
		set -f
	fi
done
# END MAIN LOOP #