Skip to content

Commit

Permalink
use oauth in youtube fetches to avoid being a bot
Browse files Browse the repository at this point in the history
  • Loading branch information
awong-dev committed Feb 13, 2025
1 parent d7b57b4 commit 7607a2d
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 5 deletions.
13 changes: 10 additions & 3 deletions functions-python/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,13 @@


def _access_secret_version(project_id, secret_id, version_id="latest"):
client = secretmanager.SecretManagerServiceClient()
secret_client = secretmanager.SecretManagerServiceClient()

# Build the resource name of the secret version
name = f"projects/{project_id}/secrets/{secret_id}/versions/{version_id}"

# Access the secret version
response = client.access_secret_version(request={"name": name})
response = secret_client.access_secret_version(request={"name": name})

# Decode the secret payload
payload = response.payload.data.decode("UTF-8")
Expand Down Expand Up @@ -57,6 +57,12 @@ def start_transcribe(
_access_secret_version('sps-by-the-numbers', 'vast_api_key'),
raw=True)

yt_refresh_token = _access_secret_version('sps-by-the-numbers',
'youtube_oauth')

hf_token = _access_secret_version('sps-by-the-numbers',
'huggingface')

target_num_instances = max(1, int(num_new_videos / VIDS_PER_MACHINE))

# Do not create new instance if one is running.
Expand Down Expand Up @@ -98,7 +104,8 @@ def start_transcribe(
onstart_cmd=("env | grep _ >> /etc/environment; "
"nohup /workspace/app/onstart_hook.sh "
f"{int(cheapest['cpu_cores_effective'])} "
"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 "
f"{hf_token} "
f"{yt_refresh_token} "
f"{lysine_timeout} 10 &"),
disk=DISK_GB,
args="",
Expand Down
2 changes: 1 addition & 1 deletion tools/process_new_vids/onstart_hook.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ eval "$(fnm env --use-on-cd --shell bash)"
fnm use lts/latest

# Default to giving 10 mins on failure.
/workspace/app/lysine_protocol.sh "${3-:30}" & python /workspace/app/transcribe_worker.py -w /tmp/transcribe -t "${1:-4}" -x "$2" -m large-v3 -c -s; /workspace/app/lysine_protocol.sh "${4:-10}"
# Start the lysine watchdog with the failure timeout from $4 (minutes), run the
# transcription worker (threads=$1, vast key=$2, YouTube refresh token=$3), then
# on worker exit restart the watchdog with the post-run timeout from $5.
# Fix: "${4-:30}" was a typo — when $4 is unset it expands to the literal
# string ":30", not the number 30. "${4:-30}" gives the intended default.
/workspace/app/lysine_protocol.sh "${4:-30}" & python /workspace/app/transcribe_worker.py -w /tmp/transcribe -t "${1:-4}" -x "$2" -y "$3" -m large-v3 -c -s; /workspace/app/lysine_protocol.sh "${5:-10}"
24 changes: 23 additions & 1 deletion tools/process_new_vids/transcribe_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@

logger = logging.getLogger(__name__)

YT_TOKEN_FILE = './yt_token.json'

WORKING_DIR = '/tmp/workspace/app/transcribe'
AUTH_PARAMS = {
'user_id': os.environ['CONTAINER_ID'],
Expand Down Expand Up @@ -45,7 +47,22 @@ def get_vid_list():
return response.json()['data']


def write_token_file(refresh_token, path=None):
    """Write the YouTube OAuth token cache file used by the downloader.

    The file contains a dummy access token plus the provided refresh
    token with an already-expired timestamp, so the YouTube client will
    refresh and obtain a real access token on first use.
    NOTE(review): the exact key set appears to match the pytube/pytubefix
    OAuth token-cache format — confirm against that library's docs.

    Args:
        refresh_token: YouTube OAuth refresh token string.
        path: Optional destination file path. Defaults to YT_TOKEN_FILE,
            preserving the original single-argument behavior.
    """
    if path is None:
        path = YT_TOKEN_FILE

    token_data = {
        # Placeholder; "expires": 0 forces an immediate refresh using
        # the refresh token, so the access token value never matters.
        "access_token": "dummy",
        "refresh_token": refresh_token,
        "expires": 0,
        "visitorData": None,
        "po_token": None}

    with open(path, "w") as f:
        json.dump(token_data, f)


def process_vids(vid_list, args):
# Setup the token file
write_token_file(args.yt_token)

for category, video_id in vid_list:
try:
logger.info(f"Processing {category} {video_id}")
Expand All @@ -69,7 +86,8 @@ def process_vids(vid_list, args):
# Download the audio file.
outfile_name = f"{video_id}.mp4"
video = YouTube(f"https://www.youtube.com/watch?v={video_id}",
"WEB")
use_oauth=True, allow_oauth_cache=True,
token_file=YT_TOKEN_FILE)
audio_streams = video.streams.filter(
only_audio=True).order_by('abr')
audio_streams.first().download(
Expand Down Expand Up @@ -157,6 +175,10 @@ def main():
metavar="HF_TOKEN", type=str,
help='Hugging Face token',
required=True)
parser.add_argument('-y', '--yt_token', dest='yt_token',
metavar="YT_TOKEN", type=str,
help='Youtube Oauth Refresh Token',
required=True)
parser.add_argument('-m', '--model', dest='model', metavar="MODEL",
type=str, help='Downloads whisper MODEL',
default="large-v3-turbo")
Expand Down

0 comments on commit 7607a2d

Please sign in to comment.