cmysmiaczxotoy

joined 2 years ago
MODERATOR OF
[–] [email protected] 35 points 8 months ago (14 children)

I needed, I would pay $5 per month in perpetuity for access to Firefox. Fuck google

 

cross-posted from: https://lemm.ee/post/23155648

Here is the script.


#!/usr/bin/env bash
# Download and search youtube subs
# deps yt-dlp ,awk, perl, any one or more of either ugrep, ripgrep, grep
# usage "script youtube_url"


main() {
    url="$@"
    check_if_url
    get_video_id
    search_for_downloaded_matching_files
    set_download_boolean_flag
    download_subs
    read_and_format_transcript_file
    echo_description_file
    user_search
}


# Iterate over the array and add items to the new array if they match the regex
check_if_url() {
    local regex='^https://[^[:space:]]+$'
        if ! [[ $url =~ $regex ]]; then
            echo "Invalid input. Valid input is a url matching regex ${regex}"
            exit 1
        fi
}


get_video_id() {
    video_id=$(echo "$url" | sed -n 's/.*v=\([^&]*\).*/\1/p')
}


search_for_downloaded_matching_files() {
    # Find newest created files matching the video_id
    transcript_file="$(  /usr/bin/ls -t --time=creation "$PWD"/*${video_id}*\.vtt 2>/dev/null | head -n 1  )"
    description_file="$(  /usr/bin/ls -t --time=creation "$PWD"/*${video_id}*\.description 2>/dev/null | head -n 1  )"
}


set_download_boolean_flag() {
    if [ -n "$transcript_file" ] && [ -n "$description_file" ]; then
        download=0 # FALSE
    else
        download=1 # TRUE
    fi
}


download_subs() {
    if [ "$download" -eq 1 ]; then
        yt-dlp --restrict-filenames --write-auto-sub --skip-download "${url}"
        yt-dlp --restrict-filenames --sub-langs=eng --write-subs --skip-download "${url}"
        yt-dlp --restrict-filenames --write-description --skip-download "${url}"
        # Search files again since they were just downloaded
        search_for_downloaded_matching_files
    fi
}


read_and_format_transcript_file() {
    perl_removed_dupes="$(perl -0777 -pe 's/^\d\d.*\n.*\n.*<\/c>//gm' <"${transcript_file}")"
    local prefix="https://www.youtube.com/watch?v=%24%7Bvideo_id%7D&t="
    local suffix="s"
    formated_transcript_file="$(awk -v pre="$prefix" -v suf="$suffix" '
    /^([0-9]{2}:){2}[0-9]{2}\.[0-9]{3}/ {
        split($1, a, /[:.]/);
        $1 = pre (int(a[1]*3600 + a[2]*60 + a[3]) - 3) suf;
        sub(/ --> [0-9]{2}:[0-9]{2}:[0-9]{2}\.[0-9]{3}/, "");
        sub(/ align:start position:0%$/, "");
        print;
        next;
    }
    {
        sub(/ align:start position:0%$/, "");
        print;
    }
    ' <<<"${perl_removed_dupes}")"
    #CRLF for ugrep to avoid ?bug? where before lines are not all outputted
    formated_transcript_file_CRLF=$(printf '%b' "$formated_transcript_file" | sed 's/$/\r/')
}


echo_description_file() {
    cat "${description_file}"
}


user_search() {
    echo -e "\n\n"
    read -rp "Enter regex (read as raw input): " search_term

    : ${app_count:=0}

    if command -v ug >/dev/null 2>&1; then
        echo -e "\n\n\n\n"
        echo "Ugrep output"
        ug --pretty=never -B2 -A1 -i -Z+-~1 -e "${search_term}" --andnot "^https?:\/\/"  <<<"$formated_transcript_file_CRLF"
        ((app_count++))
    fi

    if command -v rg >/dev/null 2>&1; then
        echo -e "\n\n\n\n"
        echo "Ripgrep output"
        rg -iP -B2 -A7 "^(?!https?:\/\/).*\K${search_term}" <<<"$formated_transcript_file"
        ((app_count++))
    fi
    
    if [ "$app_count" -eq 0 ]; then
        echo -e "\n\n\n\n"
        echo "Grep output"
        grep -iP -B2 -A1 "${search_term}" <<<"$formated_transcript_file"
        echo -e "\n\n"
        echo "Consider installing ripgrep and ugrep for better search"
        ((app_count++))
    fi
}


main "$@"


    
 

Here is the script.


#!/usr/bin/env bash
# Download and search youtube subs
# deps yt-dlp ,awk, perl, any one or more of either ugrep, ripgrep, grep
# usage "script youtube_url"


main() {
    url="$@"
    check_if_url
    get_video_id
    search_for_downloaded_matching_files
    set_download_boolean_flag
    download_subs
    read_and_format_transcript_file
    echo_description_file
    user_search
}


# Iterate over the array and add items to the new array if they match the regex
check_if_url() {
    local regex='^https://[^[:space:]]+$'
        if ! [[ $url =~ $regex ]]; then
            echo "Invalid input. Valid input is a url matching regex ${regex}"
            exit 1
        fi
}


get_video_id() {
    video_id=$(echo "$url" | sed -n 's/.*v=\([^&]*\).*/\1/p')
}


search_for_downloaded_matching_files() {
    # Find newest created files matching the video_id
    transcript_file="$(  /usr/bin/ls -t --time=creation "$PWD"/*${video_id}*\.vtt 2>/dev/null | head -n 1  )"
    description_file="$(  /usr/bin/ls -t --time=creation "$PWD"/*${video_id}*\.description 2>/dev/null | head -n 1  )"
}


set_download_boolean_flag() {
    if [ -n "$transcript_file" ] && [ -n "$description_file" ]; then
        download=0 # FALSE
    else
        download=1 # TRUE
    fi
}


download_subs() {
    if [ "$download" -eq 1 ]; then
        yt-dlp --restrict-filenames --write-auto-sub --skip-download "${url}"
        yt-dlp --restrict-filenames --sub-langs=eng --write-subs --skip-download "${url}"
        yt-dlp --restrict-filenames --write-description --skip-download "${url}"
        # Search files again since they were just downloaded
        search_for_downloaded_matching_files
    fi
}


read_and_format_transcript_file() {
    perl_removed_dupes="$(perl -0777 -pe 's/^\d\d.*\n.*\n.*<\/c>//gm' <"${transcript_file}")"
    local prefix="https://www.youtube.com/watch?v=${video_id}&t="
    local suffix="s"
    formated_transcript_file="$(awk -v pre="$prefix" -v suf="$suffix" '
    /^([0-9]{2}:){2}[0-9]{2}\.[0-9]{3}/ {
        split($1, a, /[:.]/);
        $1 = pre (int(a[1]*3600 + a[2]*60 + a[3]) - 3) suf;
        sub(/ --> [0-9]{2}:[0-9]{2}:[0-9]{2}\.[0-9]{3}/, "");
        sub(/ align:start position:0%$/, "");
        print;
        next;
    }
    {
        sub(/ align:start position:0%$/, "");
        print;
    }
    ' <<<"${perl_removed_dupes}")"
    #CRLF for ugrep to avoid ?bug? where before lines are not all outputted
    formated_transcript_file_CRLF=$(printf '%b' "$formated_transcript_file" | sed 's/$/\r/')
}


echo_description_file() {
    cat "${description_file}"
}


user_search() {
    echo -e "\n\n"
    read -rp "Enter regex (read as raw input): " search_term

    : ${app_count:=0}

    if command -v ug >/dev/null 2>&1; then
        echo -e "\n\n\n\n"
        echo "Ugrep output"
        ug --pretty=never -B2 -A1 -i -Z+-~1 -e "${search_term}" --andnot "^https?:\/\/"  <<<"$formated_transcript_file_CRLF"
        ((app_count++))
    fi

    if command -v rg >/dev/null 2>&1; then
        echo -e "\n\n\n\n"
        echo "Ripgrep output"
        rg -iP -B2 -A7 "^(?!https?:\/\/).*\K${search_term}" <<<"$formated_transcript_file"
        ((app_count++))
    fi
    
    if [ "$app_count" -eq 0 ]; then
        echo -e "\n\n\n\n"
        echo "Grep output"
        grep -iP -B2 -A1 "${search_term}" <<<"$formated_transcript_file"
        echo -e "\n\n"
        echo "Consider installing ripgrep and ugrep for better search"
        ((app_count++))
    fi
}


main "$@"


    
 

I made a script that downloads from youtube super fast using a custom aria2 build.

Aria2 https://github.com/P3TERX/Aria2-Pro-Core/releases

ffmpeg build https://github.com/yt-dlp/FFmpeg-Builds/releases

I choose ffmpeg-master-latest-linux64-gpl.tar.xz

#!/usr/bin/env bash
#set -x

if [[ -z $@ ]]; then
    echo "specify download url"
    exit
fi

dir_dl="$PWD"
url="$@"

ffmpeg_dir="$HOME/.local/bin.notpath/"
download_archive_dir="$HOME/Videos/yt-dlp/"
download_archive_filename=".yt-dlp-archived-done.txt"

mkdir -p "$download_archive_dir"

youtube_match_regex='^.*(youtube[.]com|youtu[.]be|youtube-nocookie[.]com).*$'

if [[ "$1" =~ $youtube_match_regex ]]; then
    url="$(echo "$@" | perl -pe 's/((?:http:|https:)*?\/\/(?:www\.|)(?:youtube\.com|m\.youtube\.com|youtu\.|#youtube-nocookie\.com).*(?:c(?:hannel)?\/|u(?:ser)?\/|v=|v%3D|v\/|(?:a|p)\/(?:a|u)\/\d.*\/|watch\?|vi(?:=|\/)|\/#embed\/|oembed\?|be\/|e\/)([^&amp;?%#\/\n]+)).*/$1/gm')"
    yt-dlp \
    --check-formats \
    --clean-info-json \
    --download-archive "$download_archive_dir$download_archive_filename" \
    --embed-chapters \
    --embed-info-json \
    --embed-metadata \
    --embed-thumbnail \
    --external-downloader aria2c \
    --downloader-args \
    "aria2c: \
        --allow-piece-length-change=true \
        --check-certificate=false \
        --console-log-level=notice \
        --content-disposition-default-utf8=true \
        --continue=true \
        --disk-cache=8192 \
        --download-result=full \
        --enable-mmap \
        --file-allocation=falloc \
        --lowest-speed-limit=100K \
        --max-concurrent-downloads=16 \
        --max-connection-per-server=64 \
        --max-mmap-limit=8192M \
        --max-resume-failure-tries=5 \
        --max-file-not-found=2 \
        --max-tries=3 \
        --min-split-size=64K \
        --no-file-allocation-limit=8192M \
        --piece-length=64k \
        --realtime-chunk-checksum=false \
        --retry-on-400=true \
        --retry-on-403=true \
        --retry-on-406=true \
        --retry-on-unknown=true \
        --retry-wait=1 \
        --split=32 \
        --stream-piece-selector=geom \
        --summary-interval=0 " \
    --ffmpeg-location "$ffmpeg_dir" \
    --output "$dir_dl"'/%(channel)s/%(title)s_%(channel)s_%(upload_date>%Y-%m-%d)s_%(duration>%H-%M-%S)s_%(resolution)s.%(ext)s' \
    --prefer-free-formats \
    --remux-video mkv \
    --restrict-filenames \
    --sponsorblock-remove "filler,interaction,intro,music_offtopic,outro,preview,selfpromo,sponsor" \
    --sub-langs "en.*,live_chat" \
    --write-auto-subs \
    --write-description \
    --write-info-json \
    --write-playlist-metafiles \
    --write-subs \
    --write-thumbnail \
    "$url"
else
    yt-dlp \
    --download-archive "$download_archive_dir$download_archive_filename" \
    --embed-chapters \
    --ffmpeg-location "$ffmpeg_dir" \
    --http-chunk-size 10M \
    --output "$dir_dl/%(title)s_%(duration>%H-%M-%S)s_%(upload_date>%Y-%m-%d)s_%(resolution)s_URL_(%(id)s).%(ext)s" \
    --prefer-free-formats \
    --restrict-filenames \
    "$url"
fi