Bash script to download and search YouTube subtitles and output clickable, timestamped URLs
Bash script to download and search YouTube subtitles and output clickable, timestamped URLs
Here is the script.
undefined
#!/usr/bin/env bash
#
# Download and search YouTube subtitles, printing every match together with a
# clickable, timestamped URL.
#
# deps:  yt-dlp, awk, perl, and one or more of ugrep, ripgrep, grep
# usage: script youtube_url
#
# The functions communicate through globals: url, video_id, transcript_file,
# description_file, download, formated_transcript_file and
# formated_transcript_file_CRLF.
#
# `set -e` is deliberately not enabled: the search tools exit non-zero when
# nothing matches, which is a normal outcome here rather than an error.

#######################################
# Entry point: validate the URL, fetch subtitles/description unless they are
# already present in $PWD, print the description, then prompt for a search.
# Globals:   url (written)
# Arguments: $1 - video URL
# Returns:   0 on success, 1 on bad usage or when a step fails
#######################################
main() {
  if (($# != 1)); then
    printf 'usage: %s youtube_url\n' "${0##*/}" >&2
    return 1
  fi
  url=$1
  check_if_url || return
  get_video_id || return
  search_for_downloaded_matching_files
  set_download_boolean_flag
  download_subs || return
  read_and_format_transcript_file || return
  echo_description_file
  user_search
}

#######################################
# Check that $url is a whitespace-free https:// URL.
# Globals:   url (read)
# Outputs:   error message on stderr when the check fails
# Returns:   0 if valid, 1 otherwise
#######################################
check_if_url() {
  local regex='^https://[^[:space:]]+$'
  if ! [[ $url =~ $regex ]]; then
    printf 'Invalid input. Valid input is a url matching regex %s\n' \
      "$regex" >&2
    return 1
  fi
}

#######################################
# Extract the video id from $url.
# Understands watch?v=ID, youtu.be/ID and youtube.com/{shorts,embed,live}/ID.
# Globals:   url (read), video_id (written; empty on failure)
# Outputs:   error message on stderr when no id can be found
# Returns:   0 if an id was found, 1 otherwise
#######################################
get_video_id() {
  local query_re='[?&]v=([^&#]+)'
  local short_re='^https://youtu\.be/([^/?&#]+)'
  local path_re='^https://([^/]+\.)?youtube\.com/(shorts|embed|live)/([^/?&#]+)'

  video_id=''
  if [[ $url =~ $query_re ]]; then
    video_id=${BASH_REMATCH[1]}
  elif [[ $url =~ $short_re ]]; then
    video_id=${BASH_REMATCH[1]}
  elif [[ $url =~ $path_re ]]; then
    video_id=${BASH_REMATCH[3]}
  fi

  # An empty id would turn the file glob below into "*.vtt" and silently pick
  # up subtitles that belong to some other video, so refuse to continue.
  if [[ -z "$video_id" ]]; then
    printf 'Could not extract a video id from url: %s\n' "$url" >&2
    return 1
  fi
}

#######################################
# Print the most recently created of the given files (nothing if none given).
# Arguments: $@ - candidate file paths
# Outputs:   one path on stdout
#######################################
_newest_file() {
  (($# > 0)) || return 0
  local listing
  # Prefer birth time (GNU ls); fall back to modification time where
  # --time=creation is not supported. Errors are silenced on purpose.
  listing=$(command ls -t --time=creation -- "$@" 2>/dev/null) ||
    listing=$(command ls -t -- "$@" 2>/dev/null) ||
    return 0
  # yt-dlp runs with --restrict-filenames, so one path per line is safe.
  printf '%s\n' "${listing%%$'\n'*}"
}

#######################################
# Find the newest subtitle and description files in $PWD for $video_id.
# Globals:   video_id (read), transcript_file, description_file (written;
#            left empty when nothing matches)
#######################################
search_for_downloaded_matching_files() {
  transcript_file=''
  description_file=''
  [[ -n "$video_id" ]] || return 0

  local -a vtt_files=("$PWD"/*"${video_id}"*.vtt)
  local -a desc_files=("$PWD"/*"${video_id}"*.description)
  # Without nullglob an unmatched pattern stays literal; drop it.
  [[ -e "${vtt_files[0]}" ]] || vtt_files=()
  [[ -e "${desc_files[0]}" ]] || desc_files=()

  transcript_file="$(_newest_file "${vtt_files[@]}")"
  description_file="$(_newest_file "${desc_files[@]}")"
}

#######################################
# Decide whether a download is needed.
# Globals:   transcript_file, description_file (read)
#            download (written): 0 = both files present, 1 = download
#######################################
set_download_boolean_flag() {
  if [[ -n "$transcript_file" && -n "$description_file" ]]; then
    download=0 # FALSE
  else
    download=1 # TRUE
  fi
}

#######################################
# Download auto-generated subs, uploaded subs and the description with yt-dlp
# when the download flag is set, then locate the downloaded files.
# Globals:   download, url (read); transcript_file, description_file (written)
# Returns:   0 on success or when no download is needed, 1 if yt-dlp is missing
#######################################
download_subs() {
  ((download == 1)) || return 0

  if ! command -v yt-dlp >/dev/null 2>&1; then
    printf 'yt-dlp is required but was not found in PATH\n' >&2
    return 1
  fi

  yt-dlp --restrict-filenames --write-auto-sub --skip-download "${url}"
  # NOTE(review): yt-dlp language tags usually look like "en"; confirm that
  # "eng" selects the intended track.
  yt-dlp --restrict-filenames --sub-langs=eng --write-subs --skip-download "${url}"
  yt-dlp --restrict-filenames --write-description --skip-download "${url}"

  # Search files again since they were just downloaded
  search_for_downloaded_matching_files
}

#######################################
# Turn the .vtt transcript into searchable text in which every cue starts
# with a timestamped watch URL.
# Globals:   transcript_file, video_id (read)
#            formated_transcript_file, formated_transcript_file_CRLF (written)
# Outputs:   error message on stderr when there is no readable transcript
# Returns:   0 on success, non-zero otherwise
#######################################
read_and_format_transcript_file() {
  if [[ -z "$transcript_file" || ! -r "$transcript_file" ]]; then
    printf 'No subtitle (.vtt) file found for video id %s\n' "$video_id" >&2
    return 1
  fi

  # Drop the cue blocks that carry inline <c> timing tags (text that is
  # repeated by the neighbouring cues).
  local perl_removed_dupes
  perl_removed_dupes="$(perl -0777 -pe 's/^\d\d.*\n.*\n.*<\/c>//gm' \
    <"${transcript_file}")" || return

  local prefix="https://www.youtube.com/watch?v=${video_id}&t="
  local suffix="s"

  formated_transcript_file="$(awk -v pre="$prefix" -v suf="$suffix" '
    /^([0-9]{2}:){2}[0-9]{2}\.[0-9]{3}/ {
      split($1, a, /[:.]/)
      # Link 3 seconds before the cue, but never before the start of the video.
      start = int(a[1] * 3600 + a[2] * 60 + a[3]) - 3
      if (start < 0) start = 0
      $1 = pre start suf
      sub(/ --> [0-9]{2}:[0-9]{2}:[0-9]{2}\.[0-9]{3}/, "")
      sub(/ align:start position:0%$/, "")
      print
      next
    }
    {
      sub(/ align:start position:0%$/, "")
      print
    }
  ' <<<"${perl_removed_dupes}")"

  #CRLF for ugrep to avoid ?bug? where before lines are not all outputted
  # %s (not %b) keeps backslashes in the subtitle text literal; the CR is
  # passed to sed as a real byte so BSD sed handles it too.
  formated_transcript_file_CRLF="$(printf '%s\n' "$formated_transcript_file" |
    sed $'s/$/\r/')"
}

#######################################
# Print the video description, if one was found.
# Globals:   description_file, video_id (read)
# Outputs:   description on stdout, or a notice on stderr
#######################################
echo_description_file() {
  if [[ -n "$description_file" && -r "$description_file" ]]; then
    cat -- "${description_file}"
  else
    printf 'No description file found for video id %s\n' "$video_id" >&2
  fi
}

#######################################
# Prompt for a regex and search the transcript with every available tool:
# ugrep and/or ripgrep, falling back to grep when neither is installed.
# Globals:   formated_transcript_file, formated_transcript_file_CRLF (read)
# Outputs:   search results on stdout
#######################################
user_search() {
  local search_term
  local app_count=0

  printf '\n\n\n'
  read -rp "Enter regex (read as raw input): " search_term

  if command -v ug >/dev/null 2>&1; then
    printf '\n\n\n\n\n'
    echo "Ugrep output"
    ug --pretty=never -B2 -A1 -i -Z+-~1 -e "${search_term}" \
      --andnot "^https?:\/\/" <<<"$formated_transcript_file_CRLF"
    app_count=$((app_count + 1))
  fi

  if command -v rg >/dev/null 2>&1; then
    printf '\n\n\n\n\n'
    echo "Ripgrep output"
    rg -iP -B2 -A7 -e "^(?!https?:\/\/).*\K${search_term}" \
      <<<"$formated_transcript_file"
    app_count=$((app_count + 1))
  fi

  if ((app_count == 0)); then
    printf '\n\n\n\n\n'
    echo "Grep output"
    # -e keeps a search term that starts with "-" from being read as an option.
    grep -iP -B2 -A1 -e "${search_term}" <<<"$formated_transcript_file"
    printf '\n\n\n'
    echo "Consider installing ripgrep and ugrep for better search"
  fi
}

main "$@"