-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfileopener.py
62 lines (53 loc) · 2.54 KB
/
fileopener.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import json
import os
from pytube import YouTube
from concurrent.futures import ThreadPoolExecutor, as_completed
# Function to read JSON file and extract YouTube links
def read_json_and_extract_links(file_path):
with open(file_path, 'r') as file:
data = json.load(file)
print(f"Loaded data from {file_path}:")
print(json.dumps(data, indent=2)) # Print the JSON data to inspect its structure
youtube_links = []
for item in data:
if 'url' in item:
youtube_links.append(item['url'])
else:
print(f"Warning: 'url' key not found in {item}")
return youtube_links
# Function to download a single video
def download_video(link, output_directory):
try:
yt = YouTube(link)
video = yt.streams.get_highest_resolution()
# Construct the expected filename
filename = video.default_filename
file_path = os.path.join(output_directory, filename)
if not os.path.exists(file_path):
print(f"Downloading {link} to {output_directory}")
video.download(output_directory)
else:
print(f"Skipping {link} (already downloaded)")
except Exception as e:
print(f"Error downloading {link}: {e}")
# Function to download videos from YouTube links using multithreading
def download_videos_multithreaded(youtube_links, output_directory, max_workers=4):
if not os.path.exists(output_directory):
os.makedirs(output_directory)
with ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = [executor.submit(download_video, link, output_directory) for link in youtube_links]
for future in as_completed(futures):
future.result()
# Directory containing JSON files
json_directory = r'E:\Semester 5\Integrated Design Project\Test\Dataset'
train_json_path = os.path.join(json_directory, 'MSASL_train.json')
validate_json_path = os.path.join(json_directory, 'MSASL_val.json')
test_json_path = os.path.join(json_directory, 'MSASL_test.json')
# Read JSON files and extract YouTube links
train_links = read_json_and_extract_links(train_json_path)
validate_links = read_json_and_extract_links(validate_json_path)
test_links = read_json_and_extract_links(test_json_path)
# Download videos to specific directories
download_videos_multithreaded(train_links, 'videos/train', max_workers=8)
download_videos_multithreaded(validate_links, 'videos/validate', max_workers=8)
download_videos_multithreaded(test_links, 'videos/test',max_workers=8)