discogarchive: fix for bandcamp stinky

This commit is contained in:
yosh 2024-03-24 14:47:31 -04:00
parent e91ff9788c
commit c0b92df7bb
1 changed files with 17 additions and 2 deletions

View File

@ -20,6 +20,16 @@ fail() {
exit 1
}
# unescapehtml: stdin -> stdout filter that decodes the HTML character
# references for the five markup-significant characters:
#   "  (&#34; / &quot;)     '  (&#39; / &apos;)
#   <  (&#60; / &lt;)       >  (&#62; / &gt;)
#   &  (&#38; / &amp;)
# Takes no arguments. Any other text, including other entities, passes
# through unchanged. Exit status is sed's.
#
# Both the numeric and the named spelling are accepted because the producer
# of the escaped text is an external tool whose exact escaping is not visible
# here -- NOTE(review): confirm which forms it actually emits.
unescapehtml() {
	# The ampersand rule must stay last and must cover both spellings in one
	# pass: otherwise input such as "&amp;quot;" would be decoded twice
	# (first to "&quot;", then to a bare quote).
	sed -E \
		-e 's/&(#34|quot);/"/g' \
		-e "s/&(#39|apos);/'/g" \
		-e 's/&(#60|lt);/</g' \
		-e 's/&(#62|gt);/>/g' \
		-e 's/&(#38|amp);/\&/g'
}
clean() {
trap 'exit' INT HUP QUIT TERM EXIT
[ -f "$HTML_FILE" ] && rm -f "$HTML_FILE"
@ -81,10 +91,11 @@ errecho 'Moving on!'
# HTML/JSON PARSING #
# Download the page given as the first argument into $HTML_FILE
# (-L follows redirects, -s suppresses curl's progress output).
curl -L -s -o "$HTML_FILE" "$1"
# Reset the string accumulators to empty before they are appended to.
# NOTE(review): the two assignments below appear to be the before/after of a
# single changed line in this diff; the second is the first plus
# contained_releases_raw, so running both is harmless but redundant.
tmplog="" look_closer="" uncontained_releases="" contained_releases=""
tmplog="" look_closer="" uncontained_releases="" contained_releases="" contained_releases_raw=""
# mktemp -u only prints an unused file name; it does not create the file.
# NOTE(review): presumably the file is written later, before jq reads "$json"
# -- confirm, and consider plain mktemp to avoid the name-reuse race.
json="$(mktemp -u)"
# albums and tracks
while read -r url; do
url="$BASEURL$url"
# if rg -q -F "$url" "$logdir/log" 2>/dev/null; then
# errecho "ALREADY CONTAINED $type: $url"
# tmplog="$tmplog$url CONTAINED$NL"
@ -98,6 +109,7 @@ while read -r url; do
errecho "LOOK CLOSER: $url"
elif [ "$(jq 'if (.inAlbum) then .inAlbum.albumRelease[0].offers.price else .albumRelease[0].offers.price end' < "$json")" = "0.0" ]; then
contained_releases="<a href=\"$url\" rel=\"nofollow\">$artist - $name</a><br>${NL}${contained_releases}"
contained_releases_raw="$url$NL$contained_releases_raw"
errecho "CONTAINED RELEASE: $artist - $name"
if ! rg -q -F "$url" "$logdir/log" 2>/dev/null; then
echo "$url" >> "$logdir/log"
@ -107,7 +119,10 @@ while read -r url; do
errecho "UNCONTAINED RELEASE: $artist - $name"
fi
done <<-EOF
$(rg -e '"(/(album|track)/.+)"' -or "$BASEURL"'$1' "$HTML_FILE")
$(
pup '#music-grid > li > a attr{href}' < "$HTML_FILE"
pup '#music-grid attr{data-client-items}' < "$HTML_FILE" | unescapehtml | jq -r '.[].page_url'
)
EOF
# formatting the description