From 9180519e45c4a74a02301ca5ed43a7474f753c31 Mon Sep 17 00:00:00 2001 From: yosh Date: Fri, 5 Apr 2024 18:12:38 -0400 Subject: [PATCH] discogarchive: yeah I redid the whole thing basically --- discogarchive | 337 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 228 insertions(+), 109 deletions(-) diff --git a/discogarchive b/discogarchive index 1da6c58..d49e34a 100755 --- a/discogarchive +++ b/discogarchive @@ -20,136 +20,255 @@ fail() { exit 1 } +# transform the escapes html entities that bandcamp has in that one thing +# into unescaped versions, so we can put it through jq unescapehtml() { - sed ' - s/"/"/g - s/'/'\''/g - s/<//g - s/&/\&/g - ' + sed 's/"/"/g ; s/'/'\''/g ; s/<//g ; s/&/\&/g' +} +# for release escaping +escapehtml() { + sed 's/&/\&/g ; s//\>/g ; s/"/\"/g ; s/'\''/\'/g' +} + +# inserts into a section at index after a url +# $1: section +# $2: index (0-indexed) +# $3: string +insert_into_section() { + index="$2" + case "$index" in + end) index_str='/^$' ;; + *) index_str="+$index" ;; + esac + >/dev/null ed DESCRIPTION.html <<-EOF + /
$section + $index_str + i + $3 + . + wq + EOF +} + +# gets all from a section +get_section() { + awk -v h="$1" -v RS="" -v FS='\n' '$1 ~ h' DESCRIPTION.html +} + +# checks if release link is contained or not +# returns num:line if it is +is_contained() { + get_section "Contained Releases" | grep -nF "href=\"$1\"" +} + +is_uncontained() { + get_section "Uncontained Releases" | grep -nF "href=\"$1\"" +} + +# move a link from uncontained to contained if it exists in uncontained +# uses $last_contained +# $1 = url +move_from_uncontained() { + if conline=$(is_contained "$last_contained"); then + conline="$(grep -nF "${conline#*:}" DESCRIPTION.html)" + else + conline="/
Contained Releases" + fi + >/dev/null ed DESCRIPTION.html <<-EOF + ${unline%%:*}m${conline%%:*} + wq + EOF } clean() { trap 'exit' INT HUP QUIT TERM EXIT - [ -f "$HTML_FILE" ] && rm -f "$HTML_FILE" - [ -f "$json" ] && rm -f "$json" + [ -f "${DISCOG_PAGE:-}" ] && rm -f "$DISCOG_PAGE" + [ -f "${RELEASE_PAGE:-}" ] && rm -f "$RELEASE_PAGE" + [ -f "${JSON_HEAD:-}" ] && rm -f "$JSON_HEAD" + [ -f "${JSON_TRALBUM:-}" ] && rm -f "$JSON_TRALBUM" exit } trap 'clean' INT HUP QUIT TERM EXIT # OPTIONS # -no_download="" -while getopts :n OPT; do +no_download="" only_download="" skip_contained="" force_desc="" +while getopts :nds OPT; do case $OPT in n) no_download=1 ;; # don't download files, just make the description.html + d) only_download=1 ;; # only download files, don't make description.html + s) skip_contained=1 ;; # skip known albums, will insert uncontained -> contained ones after "last contained album" *) fail "unknown option: -$OPTARG" ;; esac done shift "$((OPTIND - 1))" +# END OPTIONS # -case $1 in - *.bandcamp.com*) : ;; - *) fail 'Please use the *.bandcamp.com link instead of any custom domains!' ;; -esac -IFS="/" read -r _ _ BANDCAMP_DOMAIN _ <<-EOF - $1 +# SETUP # +# creates a NEWLINE-deliminated list of all bandcamp domains in question +BANDCAMP_DOMAINS=$(awk -v RS='/| ' '$1 ~ "bandcamp.com"' <<-EOF + $* EOF -mkdir -p "$BANDCAMP_DOMAIN"; cd "$BANDCAMP_DOMAIN" -logdir="${XDG_DATA_HOME:-$HOME/.local/share}/discogarchive" -mkdir -p "$logdir" +) +DISCOG_PAGE=$(mktemp) +RELEASE_PAGE=$(mktemp) +JSON_HEAD=$(mktemp) +JSON_TRALBUM=$(mktemp) +## END SETUP -HTML_FILE="/tmp/$BANDCAMP_DOMAIN.html" -BASEURL="https://$BANDCAMP_DOMAIN" +# MAIN LOOP # +for BANDCAMP_DOMAIN in $BANDCAMP_DOMAINS; do + cd ~/data/discographies # normalized path ig + is_existing_discog="" + if [ -d "$BANDCAMP_DOMAIN" ]; then + is_existing_discog=1 + fi + mkdir -p "$BANDCAMP_DOMAIN"; cd "$BANDCAMP_DOMAIN" -# SOCIALS # -errecho 'input artist socials in KEY=VALUE format. blank line to continue. bandcamp is already filled -common shorthands: -web=website, sc=soundcloud, yt=youtube, tw=twitch, tr=twitter, mx=mixcloud, -ig=instagram, sp=spotify, lt=Linktree, ch=Cohost' -socials="Bandcamp
$NL" -while IFS="=" read -r key val; do - case "$key" in - web) key=Website ;; - sc) key=SoundCloud ;; - yt) key=YouTube ;; - tw) key=Twitch ;; - tr) key=Twitter ;; - mx) key=Mixcloud ;; - ig) key=Instagram ;; - sp) key=Spotify ;; - lt) key=Linktree ;; - ch) key=Cohost ;; - "") break ;; - *) : ;; - esac - socials="$socials$key
$NL" -done -socials="$(printf '%s' "$socials" | sort)$NL" -errecho 'Moving on!' + BASEURL="https://$BANDCAMP_DOMAIN" -# HTML/JSON PARSING # -curl -L -s -o "$HTML_FILE" "$1" + # SOCIALS # + if [ -z "$only_download" ] && [ -z "$is_existing_discog" ] || ! [ -f DESCRIPTION.html ]; then + >&2 cat <<-EOF + input artist socials in KEY=VALUE format. blank line to continue. bandcamp is already filled + common shorthands: + web=website, sc=soundcloud, yt=youtube, tw=twitch, tr=twitter, mx=mixcloud, + ig=instagram, sp=spotify, lt=Linktree, ch=Cohost + EOF + # creates the DESCRIPTION thingy + printf 'Bandcamp
\n' "$BASEURL" > DESCRIPTION.html + while IFS="=" read -r key val; do + case "$key" in + web) key=Website ;; + sc) key=SoundCloud ;; + yt) key=YouTube ;; + tw) key=Twitch ;; + tr) key=Twitter ;; + mx) key=Mixcloud ;; + ig) key=Instagram ;; + sp) key=Spotify ;; + lt) key=Linktree ;; + ch) key=Cohost ;; + "") break ;; + *) : ;; + esac + printf '%s
\n' "$val" "$key" >> DESCRIPTION.html + done + errecho 'Moving on!' + printf '\n
%s
\n\n
%s
' \ + "Uncontained Releases" "Contained Releases" >> DESCRIPTION.html + fi + # END SOCIALS # -tmplog="" look_closer="" uncontained_releases="" contained_releases="" contained_releases_raw="" -json="$(mktemp -u)" -# albums and tracks -while read -r url; do - url="$BASEURL$url" -# if rg -q -F "$url" "$logdir/log" 2>/dev/null; then -# errecho "ALREADY CONTAINED $type: $url" -# tmplog="$tmplog$url CONTAINED$NL" -# continue -# fi - curl -L -s -o - "$url" | pup 'script[type="application/ld+json"]' 'text{}' > "$json" || continue - artist="$(jq -r '.byArtist.name' < "$json")" - name="$(jq -r '.name' < "$json")" - if [ "$(jq '.numTracks' < "$json")" = "0.0" ]; then - look_closer="$artist - $name
$NL$look_closer" - errecho "LOOK CLOSER: $url" - elif [ "$(jq 'if (.inAlbum) then .inAlbum.albumRelease[0].offers.price else .albumRelease[0].offers.price end' < "$json")" = "0.0" ]; then - contained_releases="$artist - $name
${NL}${contained_releases}" - contained_releases_raw="$url$NL$contained_releases_raw" - errecho "CONTAINED RELEASE: $artist - $name" - if ! rg -q -F "$url" "$logdir/log" 2>/dev/null; then - echo "$url" >> "$logdir/log" + # HTML/JSON PARSING # + curl -L -s -o "$DISCOG_PAGE" "$BASEURL/music" + + if [ -z "$only_download" ]; then + look_closer="" uncontained_releases="" contained_releases="" contained_releases_raw="" + last_contained="" last_uncontained="" + # albums and tracks + while read -r url; do + case "$url" in + /*) url="$BASEURL$url" ;; + *) url=${url%%\?*} ;; + esac + + # if is contained, skip it but also add its thingy to the contained releases var + if [ -n "$skip_contained" ] && is_contained "$url" >/dev/null; then + last_contained="$url" + errecho "ALREADY CONTAINED: $url" + continue + fi + + # get album data + curl -L -s -o "$RELEASE_PAGE" "$url" + pup 'head > script[type="application/ld+json"]' 'text{}' < "$RELEASE_PAGE" > "$JSON_HEAD" || continue + pup -p '[data-tralbum] attr{data-tralbum}' 'text{}' < "$RELEASE_PAGE" > "$JSON_TRALBUM" || continue + + artist="$(jq -r '.byArtist.name' < "$JSON_HEAD")" + name="$(jq -r '.name' < "$JSON_HEAD")" + numtracks="$(jq -r '.inAlbum.numTracks' < "$JSON_HEAD")" + [ "$numtracks" = "null" ] && numtracks="$(jq -r .numTracks < "$JSON_HEAD")" + price="$(jq ' + if (.inAlbum) then .inAlbum.albumRelease[0].offers.price + else .albumRelease[0].offers.price end' < "$JSON_HEAD" + )" + + # check if it even has audio first + if [ "$(jq .hasAudio < "$JSON_TRALBUM")" != "true" ]; then + errecho "NO AUDIO, LOOK CLOSER: $url" + fi + + # now check price :D + if [ "$price" = "0.0" ]; then + if unline=$(is_uncontained "$url"); then + unline="$(grep -nF "${unline#*:}" DESCRIPTION.html)" + move_from_uncontained "$url" + else + if indexline=$(is_contained "$last_contained"); then + indexline="$(grep -nF "${indexline#*:}" DESCRIPTION.html)" + else + indexline="/
Contained Releases" + fi + >/dev/null ed DESCRIPTION.html <<-EOF + ${indexline%%:*} + a + $artist - $name
+ . + wq + EOF + fi + errecho "CONTAINED RELEASE: $artist - $name" + last_contained="$url" + else + # just to catch if we are rebuilding something and don't skip + is_contained "$url" >/dev/null && continue + errecho "UNCONTAINED RELEASE: $artist - $name" + is_uncontained "$url" >/dev/null && continue + if indexline=$(is_contained "$last_uncontained"); then + indexline="$(grep -nF "${indexline#*:}" DESCRIPTION.html)" + else + indexline="/
Uncontained Releases" + fi + >/dev/null ed DESCRIPTION.html <<-EOF + ${indexline%%:*} + a + $artist - $name
+ . + wq + EOF + last_uncontained="$url" + fi + done <<-EOF + $( + pup '#music-grid > li > a attr{href}' < "$DISCOG_PAGE" + pup '#music-grid attr{data-client-items}' < "$DISCOG_PAGE" | unescapehtml | jq -r '.[].page_url' + ) + EOF + fi + + # download shit + if [ -z "$no_download" ]; then + outdir="./files" + if [ -n "$is_existing_discog" ]; then + outdir="./files/new_stuff" fi - else - uncontained_releases="$artist - $name
${NL}${uncontained_releases}" - errecho "UNCONTAINED RELEASE: $artist - $name" + mkdir -p "$outdir" + printf '%s' 'CAT.ALL' > ./files/_rules.conf + # artist img + img="$(pup 'img.band-photo' 'attr{src}' < "$DISCOG_PAGE")" + if [ -n "$img" ]; then + ext="${img##*.}" + curl -s -o ./files/artist."$ext" "${img%_*}_0.$ext" + fi + errecho "Done with retrieving metadata! Now to download..." + bcdl-free --no-unzip -z 12345 -f FLAC -d "$outdir" -e auto -l "$BASEURL/music" + + set +f + if [ "$(printf '%s' "$outdir"/*.flac)" != "$outdir/*.flac" ]; then + mkdir "$outdir"/TRACKS + mv "$outdir"/*.flac "$outdir"/TRACKS + fi + set -f fi -done <<-EOF - $( - pup '#music-grid > li > a attr{href}' < "$HTML_FILE" - pup '#music-grid attr{data-client-items}' < "$HTML_FILE" | unescapehtml | jq -r '.[].page_url' - ) -EOF - -# formatting the description -if [ -n "${uncontained_releases}" ]; then - uncontained_releases="$NL
Uncontained Releases
$NL$uncontained_releases" -fi -contained_releases="$NL
Contained Releases
$NL$contained_releases" - -printf '%s%s%s%s' \ - "${look_closer:+$look_closer$NL}" "$socials" "$uncontained_releases" "$contained_releases" > DESCRIPTION.html - -# download shit -if [ ! "$no_download" ]; then - mkdir -p "files" - printf '%s' 'CAT.ALL' > ./files/_rules.conf - # artist img - img="$(pup 'img.band-photo' 'attr{src}' < "$HTML_FILE")" - if [ -n "$img" ]; then - ext="${img##*.}" - curl -s -o ./files/artist."$ext" "${img%_*}_0.$ext" - fi - errecho "Done with retrieving metadata! Now to download..." - bcdl-free --no-unzip -z 12345 -f FLAC -d "./files" -e auto -l "$BASEURL" - - set +f - if [ ./files/*.flac != "./files/*.flac" ]; then - mkdir ./files/TRACKS - mv ./files/*.flac ./files/TRACKS - fi -fi +done +# END MAIN LOOP #