Made a simple scraper to download all the lecture recordings in MDS. Then rename and sort them by courses.
# Python3
import requests
from bs4 import BeautifulSoup
import wget
import os
################
### Download ###
################
# download all the lecture recordings from "http://url-hidden"
url = "http://url-hidden"
rq = requests.get(url)
bs = BeautifulSoup(rq.text, features="lxml")
mp4 = [x.text for x in bs.find_all("li")]
t = len(mp4)
print("Download start.\n{} videos in total.\n".format(t))
n = 1
for m in mp4:
url_dl = url+m
print("[{}/{}] {}".format(n, t, m))
wget.download(url_dl)
print()
n += 1
print("\nDownload finished\n")
################
#### Rename ####
################
# rename files
# e.g. `20190909-090006-008.mp4` to `521-1.mp4`.
id_seq = [[521, 551, 542, 511], [531, 552, 512, 523], [561, 571, 532, 513], [562, 572, 573, 522], [563, 524, 574, 553]] # lecture sequence based on calendar
file_names = []
for x in id_seq:
for i in range(1, 9):
file_names.extend(str(y)+'-'+str(i) for y in x)
# HANDLE EXCEPTIONS:
# 2019-10-18 missing : '531-2', '552-2'
# 2019-10-24 one lecture (523) missing: '523-4'
# 2020-01-17 missing: '562-2', '572-2'
# final week missing (2020-03-16 ~ 2020-03-19): [563, 524, 574, 553] - 7&8
miss_rec = ['531-2', '552-2', '523-4', '562-2', '572-2', '563-7', '524-7', '574-7', '553-7', '563-8', '524-8', '574-8', '553-8']
file_names = [x for x in file_names if x not in miss_rec] # drop missing records
i = file_names.index('531-1')
file_names[i: i+4] = ['512-1', '523-1', '531-1', '552-1'] # course order is different in that week
print("Renaming files:\n")
i = 0
log = []
for x in os.listdir():
if x[-4:] == '.mp4':
os.rename(x, file_names[i]+'.mp4')
rec = x+' to '+file_names[i]+'.mp4'
print(rec)
log.append(rec)
i += 1
# save the rename log in a text file
f = open('rename_log.txt','w')
for ele in log:
f.write(ele+'\n')
f.close()
# save the missing recordings
f = open('missing_recordings.txt','w')
for ele in miss_rec:
f.write(ele+'\n')
f.close()
print('\nRenaming completed. Log saved as `rename_log.txt` and `missing_recordings.txt`.')
################
#### Folder ####
################
# create folders by course and move recordings into folders accordingly
# e.g.: `551-1.mp4` moved into `DSCI_551_Descriptive Statistics and Probability for Data Science`
# get the course ID and names to create folders
# course_html = requests.get("https://courses.students.ubc.ca/cs/courseschedule?pname=subjarea&tname=subj-department&dept=DSCI")
course_html = requests.get("https://courses.students.ubc.ca/cs/courseschedule?tname=subj-department&sessyr=2020&sesscd=W&dept=DSCI&pname=subjarea")
course_bs = BeautifulSoup(course_html.text, features="lxml")
l = [x.text for x in course_bs.find_all('td')][2: -4]
id_name_dict = {}
id_list = l[::2]
id_list = [x.replace(" ", "_") for x in id_list]
name_list = l[1::2]
# name_list = [x.replace(" ", "-") for x in name_list]
for i in range(int(len(l)/2)):
full_name = id_list[i]+'_'+name_list[i]
id_name_dict[id_list[i][-3:]] = full_name
# create folders for each course
for x in id_name_dict.values():
os.mkdir(x)
# move recordings into the folders they belong
print("\nMoving files into folders: \n")
log = []
for x in os.listdir():
if x[-4:] == '.mp4':
os.rename(x, str(id_name_dict[x[:3]])+'/'+x)
rec = str(x)+' moved into '+str(id_name_dict[x[:3]])
print(rec)
log.append(rec)
# save the log in a text file
f = open('folderize_log.txt','w')
for ele in log:
f.write(ele+'\n')
f.close()
print('\nCompleted. Log saved as `folderize_log.txt`.')