#!/bin/bash
##################################################################
# ARCFUNC - PUBLIC - NOT POSIX COMPLIANT
#
# Scripts to help maintain my archives. Some were mostly AI-assisted; those are noted.
#
# REQUIREMENTS: yt-dlp, gallery-dl, ripgrep
#
##################################################################
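# Typical usage (a sketch; assumes you source this file from your shell rc):
#   . /path/to/arcfunc.sh
#   arcfunc arc   # list the archive-related commands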
ERROR='\033[0;31mERROR:\033[0m'
WARN='\033[1;33mWARN:\033[0m'
NOTE='\033[0;36mNOTE:\033[0m'
arcfunc() {
printf "arcfunc by banement\n"
printf "arc | etc | help\n"
if [ "$1" = "arc" ]; then
printf "$WARN checkdel - Compares live Twitter metadata to the images we have downloaded, and sorts deleted images.\n"
printf "cdxback <website> - Makes a CDX from inputted site.\n"
printf "cdxmake <website> - Generates a list of URLs from a CDX. Use after cdxback.\n"
printf "$WARN da-search <username> - Searches a LOCAL CDX DB and grabs its IA URLs.\n"
printf "dvconv <file> - Converts a minidv file to MP4.\n"
printf "idgen - Generates a list of video IDs as output.txt.\n"
printf "pixivcheck - Checks if pixiv images are still online or not.\n"
printf "pixivnum - Displays the detected amount of pixiv images.\n"
printf "quick-warc - WARC a website.\n"
printf "twtarc-neo - The improved and fixed version of twtarc. Attempts to recover images from deleted twitter accounts via IA API.\n"
# printf "twthelp - Shows the other twtarc commands.\n" outdated
printf "unlistedhunter - Goes through the current folder of downloaded YouTube videos, and verifies their status online, with priority over unlisted videos. May require cookies.\n"
printf "videocheck - Like unlistedhunter, but gives better details as to what videos are private, deleted, etc.\n"
printf "$NOTE wixmp-search <uuid> - Searches IA API for URLs.\n"
printf "zipback - Get all archives (and other files) from a specified wayback website.\n"
elif [ "$1" = "etc" ]; then
printf "apk-size - On Alpine, gives a list of all the packages and their sizes.\n"
elif [ "$1" = "help" ]; then
printf "yellow = NAS-specific setup | blue = read the README\n"
fi
}
# Scans the current folder for pixiv images and checks if they are still online.
pixivcheck() {
pixCount=$(find . -maxdepth 1 -type f -print | sed -n 's#./\([0-9]\+\)_p.*#\1#p' | wc -l)
echo "There are $pixCount matching files."
pixBase="https://www.pixiv.net/en/artworks/"
find . -maxdepth 1 -type f -print | while read -r file; do
pixID=$(echo "$file" | sed -n 's#./\([0-9]\+\)_p.*#\1#p')
if [ -n "$pixID" ]; then
#echo "Checking PixID: $pixID"
# Debugging: Print the full URL being checked
pixURL="$pixBase$pixID"
#echo "URL: $pixURL"
# Use curl to follow redirects, capture final status code
pixStat=$(curl -s -L -o /dev/null -w "%{http_code}" "$pixURL")
# Debugging: Show the final response status
#echo "Response Code for $url: $pixStat"
if [ "$pixStat" = "404" ]; then
echo "$pixID is lost media!"
elif [ "$pixStat" = "200" ]; then
:
else
echo "Unknown!"
echo "Status Code: $pixStat"
fi
sleep 2
fi
done
}
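# pixivcheck and pixivnum both rely on the standard pixiv filename convention,
# e.g. a (hypothetical) file "123456789_p0.png" is treated as artwork ID 123456789.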
pixivnum() {
find . -maxdepth 1 -type f -print | sed -n 's#./\([0-9]\+\)_p.*#\1#p'
}
idgen() {
(find . -regex '.*\.\(mkv\|mp4\|webm\)' | sed -En '/.*-[A-Za-z0-9_-]{11}\.[^.]+$/ s/.*-([A-Za-z0-9_-]{11})\.[^.]+$/\1/p'; \
find . -regex '.*\.\(mkv\|mp4\|webm\)' | grep '\[[^]]\{11\}\]' | sed -E 's/.*\[([^]]{11})\].*/\1/' | uniq) >> output.txt
}
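# idgen matches both filename styles, e.g. (hypothetical names):
#   "Some Video-dQw4w9WgXcQ.mp4"    -> dQw4w9WgXcQ
#   "Some Video [dQw4w9WgXcQ].webm" -> dQw4w9WgXcQ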
# Checks your locally downloaded YouTube videos against their current status on YouTube, flagging unlisted ones.
unlistedhunter() {
# Video list file
videoList="localIDs.txt"
cCount=0
aCount=0
pCount=0
tCount=0
pubCount=0
unCount=0
dCount=0
stats() {
printf "%s\nunlistedhunter stats"
echo "C A P T . U D"
echo "$cCount $aCount $pCount $tCount $pubCount $unCount $dCount"
}
if [ "$1" = "old" ]; then
# Extract video IDs from filenames in the old format
find . -maxdepth 1 -regex '.*\.\(mkv\|mp4\|webm\)' | sed -En '/.*-[A-Za-z0-9_-]{11}\.[^.]+$/ s/.*-([A-Za-z0-9_-]{11})\.[^.]+$/\1/p' > "$videoList"
else
# Extract video IDs from filenames with square brackets
find . -maxdepth 1 -regex '.*\.\(mkv\|mp4\|webm\)' | grep '\[[^]]\{11\}\]' | sed -E 's/.*\[([^]]{11})\].*/\1/' > "$videoList"
fi
# Base URL for YouTube videos
baseUrl="https://www.youtube.com/watch?v="
echo "unlistedhunter: C = Copyright, A = 18+, P = Private, T = Terminated, . = Public, D = Deleted"
# Loop through each line in the video list
while read -r videoId; do
# Skip empty lines
if [ -z "$videoId" ]; then
echo "Skipping empty line."
continue
fi
# Construct the full video URL
videoUrl="${baseUrl}${videoId}"
# Run yt-dlp to simulate fetching video info (without actually downloading it)
output=$(yt-dlp --simulate --print-json "$videoUrl" 2>&1) # Add --cookies <loc> or --cookies-from-browser
# Check for specific strings in the output
if echo "$output" | grep -q "copyright claim"; then # Copyright claimed
printf "C"
cCount=$((cCount + 1))
elif echo "$output" | grep -q "inappropriate"; then # 18+
printf "A"
aCount=$((aCount + 1))
elif echo "$output" | grep -q "private"; then # Private video
printf "P"
pCount=$((pCount + 1))
elif echo "$output" | grep -q "terminated"; then # Terminated YT account
printf "T"
tCount=$((tCount + 1))
elif echo "$output" | grep -q "Video unavailable"; then
printf "D"
printf "\nDELETED: %s\n" "$videoUrl"
dCount=$((dCount + 1))
else
if echo "$output" | grep -q "unlisted"; then # What we want
printf "\nUNLISTED: %s\n" "$videoUrl"
echo "$output" | jq '.title'
unCount=$((unCount + 1))
else # Public videos
printf "."
pubCount=$((pubCount + 1))
fi
fi
done < "$videoList"
stats
}
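# unlistedhunter usage (run inside a folder of downloaded videos):
#   unlistedhunter       # filenames in the "[ID]" style
#   unlistedhunter old   # filenames in the older "-ID.ext" style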
twtcdx() {
if [ -z "$1" ]; then
echo "Gets the cdx file for a Twitter user."
echo "Usage: twtcdx <username>"
return 1
fi
curl "https://web.archive.org/cdx/search/cdx?url=twitter.com/$1/status/&matchType=prefix&output=json&filter=statuscode:200" > "cdx_$1.json"
echo "Created cdx_$1.json"
}
twtquery() {
if [ -z "$1" ]; then
echo "Check if there's any archives of a twitter user."
echo "Usage: twtquery <username>"
return 1
fi
response=$(curl -s "https://web.archive.org/cdx/search/cdx?url=twitter.com/$1/status/&matchType=prefix&output=json&filter=statuscode:200")
if [ "$response" = "[]" ]; then
echo "Nothing found!"
else
echo "Valid!"
fi
}
twtmake() {
if [ -z "$1" ]; then
echo "Print all the image URLs"
echo "Usage: twtmake <username>"
return 1
fi
json_file="cdx_$1.json"
jq -r '.[] | "\([.[1], .[2]])"' "$json_file" | tail -n +2 | while IFS=',' read -r timestamp original; do
# Clean up quotes from extracted values
timestamp=$(echo "$timestamp" | tr -d '"' | tr -d '[')
original=$(echo "$original" | tr -d '"' | tr -d ']')
# Now you can use timestamp and original variables for your command
#echo "Timestamp: $timestamp"
#echo "Original URL: $original"
formURL="https://web.archive.org/web/$timestamp/$original"
if [ "$2" = "img" ]; then
imageURL="$(curl -s "$formURL" | grep 'meta property="og:image" content="' | grep -o 'https://web.archive.org/web/'"$timestamp"'im_/https://pbs.twimg.com/media/[A-Za-z0-9-]\+\.[a-z]\+')"
echo "$imageURL"
else
echo "$formURL"
fi
done
}
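# Example (hypothetical username): twtcdx someuser && twtmake someuser img
# prints the archived pbs.twimg.com image URLs found via cdx_someuser.json.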
twthelp() { # outdated
echo "twtarc - Uses archive.org to attempt to recover images from a deleted account."
echo "twtcdx - Gets the CDX file from API."
echo "twtquery - Check if a user was even archived."
echo "twtmake - Print all URLs of a certain type."
}
# NOTE: most of this was AI-generated, but I've made a lot of edits to it since.
zipback() {
if [ -z "$1" ]; then
echo "Get all archives from a wayback website, no dupes."
echo "Usage: zipback <site> <extensions>"
echo "-h for suggested formats."
return 1
elif [ "$1" = "-h" ]; then
echo "Old sites: jpg jpeg zip avi mpg mpeg wmv"
return 1
fi
BASE_URL="$1"
shift
EXTENSIONS=("$@") # All remaining arguments are extensions
# if we don't have the cdx already, get it
if ! test -f "cdx_$BASE_URL.json"; then
echo "Fetching list of files from archive.org for $BASE_URL..."
curl -s "https://web.archive.org/cdx/search/cdx?url=${BASE_URL}&matchType=prefix&output=json&collapse=urlkey&filter=statuscode:200" > "cdx_$BASE_URL.json"
else
echo "cdx found"
fi
# we're timing out when trying to get the cdx
if grep -q "<html" "cdx_$BASE_URL.json"; then
echo -e "$ERROR cdx file invalid! IA is having issues. Try later."
grep "<html" "cdx_$BASE_URL.json"
rm "cdx_$BASE_URL.json" # delete it, it's no good
return 1
fi
# === BUILD jq FILTER ===
jq_filter=""
for ext in "${EXTENSIONS[@]}"; do
jq_filter+="endswith(\".${ext}\") or "
done
jq_filter="${jq_filter::-4}" # Remove trailing " or "
# === PARSE AND DOWNLOAD FILES ===
echo "Filtering for extensions: ${EXTENSIONS[*]}"
echo "Downloading to zipback/"
mkdir -p ./zipback
touch ./zipback/temp_urlList.txt
requests=0
# da loop
jq -r '.[] | select(.[2] | '"$jq_filter"') | .[1] + " " + "https://web.archive.org/web/" + .[1] + "/" + .[2]' cdx_"$BASE_URL".json | tail -n +2 | while read -r timestamp url; do
filename=$(basename "$url")
if [ -f "./zipback/$filename" ]; then
echo "NOTE: Exists, skipping $filename"
continue
fi
# mostly for debugging
time=$(date +%H:%M:%S)
# if we hit IA too hard they block us temporarily
status=$(wget --spider --timeout=10 --server-response "$url" 2>&1) # | awk '/^ HTTP/{print $2}'
if echo "$status" | grep -q "Connection refused"; then
echo -e "[$time] $ERROR Can't connect to $url, we're probably blocked."
echo -e "Calling timeout-test [DEBUG]"
#timeout-test
echo -e "$WARN Taking a long nap (2 minutes)."
sleep 160 # 2 = minutes, 300 = 5
# try again after sleep
status=$(wget --spider --timeout=10 --server-response "$url" 2>&1)
# still blocked? just cancel it.
if echo "$status" | grep -q "Connection refused"; then
echo -e "$ERROR Still blocked. Exiting."
return 1
fi
# Check if it's a 404 (broken link)
elif echo "$status" | grep -q "404"; then
echo -e "$WARN $filename Not found!! Skipping."
# If it's OK, move on
elif echo "$status" | grep -q "200"; then
# download valley (the good one)
echo "[$time] Downloading: $filename "
echo "$url" >> ./zipback/temp_urlList.txt
echo "$status" >> ./zipback/temp_urlList.txt
wget --timeout=10 --tries=2 --waitretry=0 --random-wait --retry-connrefused -q --show-progress --directory-prefix="./zipback" "$url"
wgetStatus=$?
requests=$((requests + 1))
# adding timestamps (if you're a nerd and using WSL)
# touchTime=$(date -d "${timestamp:0:8} ${timestamp:8:2}:${timestamp:10:2}:${timestamp:12:2}" +"%Y%m%d%H%M.%S")
# touch -t "$touchTime" "$filename"
# don't get too caught up on certain files
if [ "$wgetStatus" -ne 0 ]; then
echo -e "[$time] $WARN Shit's too slow!! Skipping."
echo "$url" >> ./zipback/err_urlList.txt
else
echo "$url" >> ./zipback/urlList.txt
fi
# check if it's not somehow HTML cus IA be quirky like that
verify=$(file "./zipback/$filename")
if echo "$verify" | grep -q "HTML"; then
echo -e "[$time] $ERROR $filename is HTML. Deleting."
echo "$url" >> ./zipback/err_urlList.txt
rm "./zipback/$filename"
fi
sleep 3 # frail child needs frequent breaks
if [ "$requests" -ge 8 ]; then
echo -e "[$time] $NOTE Resting..." # honk snoo
sleep 30
requests=0
fi
# failsafe
else
echo -e "[$time] $ERROR Uhhh, unexpected code? $status"
return 1
fi
done
rm ./zipback/temp_urlList.txt
}
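# Example (hypothetical site): zipback example.com zip jpg jpeg
# fetches cdx_example.com.json if needed and downloads matching captures into ./zipback/.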
cdxback() {
if [ -z "$1" ]; then
echo "Gets the cdx file for a given page."
echo "Usage: cdxback <site>"
return 1
fi
curl -s "https://web.archive.org/cdx/search/cdx?url=$1&matchType=prefix&output=json&collapse=urlkey&filter=statuscode:200" > "$1.cdx"
echo "Created $1.cdx"
}
cdxmake() {
if [ -z "$1" ]; then
echo "Generates a list of URLs from a CDX. Use after cdxback."
echo "Usage: cdxmake <site>"
return 1
fi
BASE_URL="$1"
# da loop
jq -r '.[] | select(.[2]) | .[1] + " " + "https://web.archive.org/web/" + .[1] + "/" + .[2]' cdx_"$BASE_URL".json | tail -n +2 | while read -r timestamp url; do
filename=$(basename "$url")
echo "$url" >> $BASE_URL\_list.txt
done
}
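# Example (hypothetical site): cdxback example.com && cdxmake example.com
# writes every archived URL from cdx_example.com.json into example.com_list.txt.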
timeout-test() {
echo -e "Repeatedly sending reqs every 5 seconds until it lets us in"
while true; do
# if we hit IA too hard they block us temporarily
time=$(date +%H:%M:%S)
echo -e "[$time] $NOTE Checking..."
status=$(wget --spider --timeout=10 --server-response "https://web.archive.org" 2>&1)
if echo "$status" | grep -q "Connection refused"; then
echo -e "[$time] $ERROR Still blocked, trying again."
sleep 5
else
echo -e "[$time] Unblocked!"
return 0
fi
done
}
da-search() {
search=$1
#shift
# "$@"
rg "$search" /sym/Root/Backups/DeviantArtDB | while IFS= read -r line; do
# Extract the quoted strings manually using POSIX tools
filename=$(echo "$line" | cut -d: -f1)
timestamp=$(echo "$line" | tr '"' '\n' | awk 'NR==4 {print $1}')
original=$(echo "$line" | tr '"' '\n' | awk 'NR==6 {print $1}')
echo "Found @ $filename/$timestamp"
# Create the Web Archive URL
archive_url="https://web.archive.org/web/${timestamp}im_/$original"
# Print the archive URL
echo "$archive_url" >> "$search.txt"
# Use curl to fetch the URL
#curl -O "$archive_url"
done
echo "Sent to $search.txt"
}
da-search-man() {
if [ $1 == "list" ]; then
ls /sym/Root/Backups/DeviantArtDB/CDX
return 1
fi
search=$2
#shift
# "$@"
rg "$search" /sym/Root/Backups/DeviantArtDB/CDX/$1 | while IFS= read -r line; do
# Extract the quoted strings manually using POSIX tools
filename=$(echo "$line" | cut -d: -f1)
timestamp=$(echo "$line" | tr '"' '\n' | awk 'NR==4 {print $1}')
original=$(echo "$line" | tr '"' '\n' | awk 'NR==6 {print $1}')
echo "Found @ $filename/$timestamp"
# Create the Web Archive URL
archive_url="https://web.archive.org/web/${timestamp}im_/$original"
# Print the archive URL (optional, for debugging)
echo "$archive_url" >> "$search.txt"
# Use curl to fetch the URL (you can add options like -O to save the file)
#curl -O "$archive_url"
done
echo "Sent to $search.txt"
}
wixmp-search() {
uuid="$1"
if [ -z "$uuid" ]; then
echo "Usage: $0 <artist_uuid>"
echo "-h for help"
return 1
elif [ "$uuid" = "-h" ]; then
echo "how2find UUID: Find image from a user, direct link, string after /f/."
return 1
fi
# List of base URL paths
paths=(
"intermediary/f"
"f"
)
for path in "${paths[@]}"; do
url="https://web.archive.org/cdx/search/cdx?url=images-wixmp-ed30a86b8c4ca887773594c2.wixmp.com/${path}/$uuid/&matchType=prefix&filter=mimetype:image/*&output=json&collapse=urlkey"
echo "Checking: $path"
response=$(curl -s "$url")
if [[ -n "$response" && "$response" != "[]" ]]; then
timestamp=$(echo "$response" | jq -r '.[1][1]')
original=$(echo "$response" | jq -r '.[1][2]')
# Create the Web Archive URL
echo "Found $timestamp"
archive_url="https://web.archive.org/web/$timestamp/$original"
# Print the archive URL
echo "$archive_url" >> "$uuid.txt"
else
echo "Nothing for $path"
fi
done
}
tumbldryer() { # unfinished
inputList="${1}_list.txt"
tempFile="temp.html"
while IFS= read -r pageUrl; do
status=$(wget --spider --timeout=10 --server-response "$pageUrl" 2>&1)
if echo "$status" | grep -q "Connection refused"; then
echo -e "[$(date)] ERROR Can't connect to $pageUrl, we're probably blocked."
echo -e "Calling timeout-test [DEBUG]"
# timeout-test
echo -e "WARN Taking a long nap (2 minutes)."
sleep 160 # Sleep for 2 minutes
# Try again after sleeping
status=$(wget --spider --timeout=10 --server-response "$pageUrl" 2>&1)
if echo "$status" | grep -q "Connection refused"; then
echo -e "ERROR Still blocked after retry. Exiting."
return 1
fi
fi
if echo "$status" | grep -q "404"; then
echo -e "WARN $pageUrl Not found!! Skipping."
elif echo "$status" | grep -q "200"; then
echo -e "\nDownloading: $pageUrl"
curl -s "$pageUrl" -o "$tempFile"
grep -oE 'http://[0-9]{1,3}\.media\.tumblr\.com[^ "]*' "$tempFile" | sort -u >> "$outputList"
else
echo -e "[$time] $ERROR Uhhh, unexpected code? $status"
return 1
fi
done < "$inputList"
# Remove duplicates from the output list
sort -u "$outputList" -o "$outputList"
}
# yes yes I KNOW I KNOW ITS AI GENERATED I JUST
# WANTED A SOLUTION ITS SO HOT IN MY ROOM RN
# ILL REWRITE THIS LATER IF YOU REALLY WANT
checkdel() {
# Check arguments
if [ "$#" -lt 1 ] || [ "$#" -gt 2 ]; then
echo "Usage: checkdel <artist> [live]"
echo "live only supports twitter right now"
return 1
elif [ -n "$1" ]; then
imagesDir=$1
metadataDir="$imagesDir/metadata"
deletedDir="$imagesDir/deleted"
fi
if [ "$2" = "live" ]; then
echo "live check"
metadataDir="$imagesDir/integrityTemp"
# echo $metadataDir
gallery-dl -d . --postprocessor twitter-integrity --no-download -o skip=false --download-archive /dev/null "https://x.com/$1" > /dev/null
# if [[ "$1" == *x.com* ]]; then
# gallery-dl -d . --postprocessor twitter-integrity --no-download -o skip=false --download-archive /dev/null "https://x.com/$1" > /dev/null
# elif [[ "$1" == *pixiv.net* ]]; then
# gallery-dl -d . --postprocessor pixiv-integrity --no-download -o skip=false --download-archive /dev/null "$1" > /dev/null
# fi
fi
# Ensure metadata directory exists
if [ ! -d "$metadataDir" ]; then
echo "Metadata directory not found at: $metadataDir"
return 1
fi
# Create deleted directory if it doesn't exist
mkdir -p "$deletedDir"
# echo "using $metadataDir"
# Extract 19-digit Twitter post IDs from filenames
tmpImageIds=$(mktemp)
tmpMetaIds=$(mktemp)
find "$imagesDir" -type f | grep -v "$metadataDir/" | grep -v "$deletedDir/" | sed -n 's#.*/\([0-9]\{19\}\)_.*#\1#p' | sort -u > "$tmpImageIds"
find "$metadataDir" -type f | sed -n 's#.*/\([0-9]\{19\}\)_.*#\1#p' | sort -u > "$tmpMetaIds"
# Find IDs with images but no metadata
missingMetaIds=$(comm -23 "$tmpImageIds" "$tmpMetaIds")
# Move matching image files to "deleted" folder
echo "Moving image files with no corresponding metadata to: $deletedDir"
echo "$missingMetaIds" | while read postId; do
echo "$postId"
find "$imagesDir" -type f | grep "/${postId}_" | grep -v "$metadataDir/" | grep -v "$deletedDir/" | while read file; do
mv "$file" "$deletedDir/"
done
done
# Cleanup
rm -f "$tmpImageIds" "$tmpMetaIds"
if [ "$2" = "live" ]; then
rm -rf "$imagesDir/integrityTemp"
fi
}
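# Example (hypothetical artist folder): checkdel someartist live
# re-checks someartist/ against freshly fetched metadata (via the twitter-integrity
# gallery-dl postprocessor used above) and moves images whose posts are gone into someartist/deleted.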
twtarc-neo() {
if [ -z "$1" ]; then
echo "Recovered archived images from a specified Twitter profile."
echo "Usage: twtarc <username>"
return 1
fi
mkdir -p "$1"
# Grab the CDX.json
curl -s "https://web.archive.org/cdx/search/cdx?url=twitter.com/$1/status/&matchType=prefix&output=json" > "cdx_$1.json"
json_file="cdx_$1.json"
# Loop through each entry in the JSON array
jq -r '.[] | "\([.[1], .[2]])"' "$json_file" | tail -n +2 | while IFS=',' read -r timestamp original; do
timestamp=$(echo "$timestamp" | tr -d '"' | tr -d '[')
original=$(echo "$original" | tr -d '"' | tr -d ']')
formURL="https://web.archive.org/web/${timestamp}if_/${original}"
# Grab all tweet-image <img> tags
imageURLs=$(curl -s "$formURL" | grep -o 'https://web.archive.org/web/'"$timestamp"'im_/https://pbs.twimg.com/media/[A-Za-z0-9_-]\+\.[a-zA-Z0-9]\+')
case "$original" in
https://*|http://*)
if [ -z "$imageURLs" ]; then
echo "[$timestamp] No images found."
else
for imageURL in $imageURLs; do
echo "[$timestamp] Downloading: $imageURL"
wget -nc -P "$1" -q "$imageURL"
sleep 2 # avoid archive.org timeouts
done
fi
;;
*)
echo "Invalid URL: $original"
;;
esac
done
touch "$1/GATHERED-BY-TWTARC"
}
# https://gist.github.com/rsms/87570aa1a839ce4884e7d83a3c3dac84
apk-size() {
apk info -e -s \* >/tmp/apksize
awk 'NR % 3 == 1' /tmp/apksize | cut -d ' ' -f 1 > /tmp/apkname
awk 'NR % 3 == 2' /tmp/apksize > /tmp/apksize2
while read -r n unit; do
B=$n
case "$unit" in
KiB) B=$(( n * 1024 )) ;;
MiB) B=$(( n * 1024 * 1024 )) ;;
GiB) B=$(( n * 1024 * 1024 * 1024 )) ;;
esac
printf "%12u %4s %-3s\n" $B $n $unit
done < /tmp/apksize2 > /tmp/apksize
paste -d' ' /tmp/apksize /tmp/apkname | sort -n -u | cut -c14-
rm /tmp/apksize /tmp/apksize2 /tmp/apkname
}
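# Deinterlaces (yadif) and re-encodes a MiniDV capture to H.264/AAC MP4.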
dvconv() {
if [ -z "$1" ]; then
echo "Usage: dvconv input.avi"
return 1
fi
input_file="$1"
output_file="${input_file%.*}.mp4"
ffmpeg -i "$input_file" -vf "yadif" -c:v libx264 -pix_fmt yuv420p -c:a aac -b:a 192k "$output_file"
}
# Improved by AI, improved and fixed further by interloper.
# Downloads an entire website as a WARC. Not recommended for huge websites; recommended for small blog sites.
quick-warc() {
if [ -z "$1" ]; then
echo "usage: quick-warc <url>"
return 1
fi
user_agent="Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27"
warc_base="$(printf '%s' "$1" | sed 's/\W/_/g')"
warc_file="${warc_base}.warc.gz"
dest_dir=/sym/Root/Software/WARCs
mkdir -p "$dest_dir"
if [ -f "${dest_dir}/${warc_base}" ]; then
echo "$warc_file already exists in $dest_dir"
return 1
else
wget --warc-file="$warc_base" \
--warc-cdx \
--mirror \
--page-requisites \
--no-check-certificate \
--restrict-file-names=windows \
-e robots=off \
--waitretry 5 \
--timeout 60 \
--tries 5 \
--wait 1 \
-U "$user_agent" \
"$1"
# Move the generated WARC and CDX files to the designated folder
if [ -f "$warc_file" ]; then
mv "$warc_file" "$dest_dir/"
mv "${warc_base}.cdx" "$dest_dir/"
test -d "$1" && rm -rf "$1"
else
echo "Couldn't move ${warc_file}. Does it exist?"
fi
return 0
fi
}
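# Example (hypothetical URL): quick-warc https://example.neocities.org
# produces a WARC named after the URL (non-word characters replaced with "_") plus its CDX in $dest_dir.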
videocheck() {
# Video list file
videoList="localIDs.txt"
if [ "$1" = "old" ]; then
# Extract video IDs from filenames in the old format
find . -maxdepth 1 -regex '.*\.\(mkv\|mp4\|webm\)' | sed -En '/.*-[A-Za-z0-9_-]{11}\.[^.]+$/ s/.*-([A-Za-z0-9_-]{11})\.[^.]+$/\1/p' | uniq > "$videoList"
else
# Extract video IDs from filenames with square brackets
find . -maxdepth 1 -regex '.*\.\(mkv\|mp4\|webm\)' | grep '\[[^]]\{11\}\]' | sed -E 's/.*\[([^]]{11})\].*/\1/' | uniq > "$videoList"
fi
# Base URL for YouTube videos
baseUrl="https://www.youtube.com/watch?v="
# Loop through each line in the video list
while read -r videoId; do
# Skip empty lines
if [ -z "$videoId" ]; then
echo "Skipping empty line."
continue
fi
# Construct the full video URL
videoUrl="${baseUrl}${videoId}"
# Run yt-dlp to simulate fetching video info (without actually downloading it)
if [ -n "${DEBUG_SCRIPT}" ]; then # Set DEBUG_SCRIPT environmental variable to anything to get full output
output=$(yt-dlp --simulate "$videoUrl")
else
output=$(yt-dlp --simulate --cookies <cookies> "$videoUrl" 2>&1) # <cookies> is a placeholder: point it at your exported cookies file
fi
# Check for specific strings in the output
if echo "$output" | grep -q "inappropriate"; then
echo "OK (18+): $videoId"
elif echo "$output" | grep -q "terminated"; then
echo "TERMINATED ACCOUNT: $videoUrl"
elif echo "$output" | grep -q "private"; then
echo "PRIVATE: $videoUrl"
elif echo "$output" | grep -q "copyright claim"; then
echo "COPYRIGHT CLAIMED: $videoUrl"
elif echo "$output" | grep -q "Video unavailable"; then
echo "Video unavailable: $videoUrl"
elif [ -n "${DEBUG_SCRIPT}" ]; then
echo "$output"
else
echo "OK: $videoId"
fi
done < "$videoList"
}
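# videocheck usage:
#   videocheck       # filenames in the "[ID]" style
#   videocheck old   # filenames in the older "-ID.ext" style
# Set the DEBUG_SCRIPT environment variable to see the raw yt-dlp output for unmatched videos.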