dl_project_code.py

# -*- coding: utf-8 -*-
"""DL_Project_Code.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1vHn40CrqLZ0G6xK6b_LuCsNOHGMxqVyM

##Reference

https://github.com/TwentyBN/something-something-v2-baseline/blob/master/data_loader_av.py

##Installing libraries
"""

!pip install sh

!pip install sk-video

"""#Import Statement"""

# import sh
import os
import json
import re
import numpy as np
import torch
import torchvision
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import cv2
import torch
import torch.nn as nn
import torch.nn.functional as F

# Prefer the GPU, but fall back to the CPU so that every later `.to(device)`
# call still works on a machine without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(torch.cuda.is_available())

"""## Mount Drive"""

from google.colab import drive
drive.mount('/content/drive')

cd '/content/drive/My Drive/DL_Project_Data/'

"""## Parameter Setting"""

# Layer sizes read by the Decoder network defined further down.
# hidden_dim: width of the Decoder's hidden linear layer.
hidden_dim = 50
# NOTE(review): latent_dim only exists as this commented-out placeholder;
# nothing in this file assigns it.
# latent_dim = m
# output_dim: width of the Decoder's output linear layer.
output_dim = 20

"""Reading the annotations file"""

# Each annotation file is parsed inside a `with` block so its handle is closed
# as soon as the JSON has been read (previously all four files stayed open).

# labels dictionary --> key - label_name, value - label_value
with open('something-something-v2-labels.json', 'r') as labels_file:
  labels = json.load(labels_file)

# train_annotation_data (list of dictionary) --> {id,label,template,placeholders}
with open('something-something-v2-train.json', 'r') as train_annotations_file:
  train_annotations_data = json.load(train_annotations_file)

# validation_annotation_data (list of dictionary) --> {id,label,template,placeholders}
with open('something-something-v2-validation.json', 'r') as validation_annotations_file:
  validation_annotations_data = json.load(validation_annotations_file)

# test_annotation_data (list of dictionary) --> {id}
with open('something-something-v2-test.json', 'r') as test_annotations_file:
  test_annotations_data = json.load(test_annotations_file)

"""### Creating list of ids for the videos"""

# One sub-folder per class; the folder names are used as the class keys.
classes = os.listdir('/content/drive/My Drive/20 classes validation videos/')
# Give every class folder a consecutive integer id, in directory-listing order.
class_label_update = {folder_name: position for position, folder_name in enumerate(classes)}
# Number of class folders that were found.
count = len(classes)
"""# Background Subtraction"""

# Single module-level MOG2 background subtractor; both SubtractBackGround
# helpers below call fgbg.apply on this one instance.
# NOTE(review): because the instance is shared, any state it accumulates carries
# over between calls (and therefore between videos) — confirm that is intended.
fgbg = cv2.createBackgroundSubtractorMOG2()
# cap = cv2.VideoCapture(train_path + "/0/2455.webm") # Example Video

def SubtractBackGround(Frames):
    '''
        Run the shared background subtractor over every frame of a video.

        params:
            Frames -- video as an array of shape (m, height, width, channels), m = number of frames
        return:
            array of shape (m, height, width) holding the result of fgbg.apply for each frame
    '''
    # Unpacking also enforces that the input is 4-dimensional.
    frame_count, rows, cols, _ = Frames.shape

    masks = np.zeros((frame_count, rows, cols))

    # Frames are fed in order, one at a time, to the module-level subtractor.
    for position, frame in enumerate(Frames):
        masks[position] = fgbg.apply(frame)

    return masks

def SubtractBackGround_Image(Frame):
    '''
        Run the shared background subtractor on a single frame.

        params:
            Frame -- one image, passed unchanged to fgbg.apply
        return:
            whatever fgbg.apply returns for that frame
    '''
    return fgbg.apply(Frame)

# creating a dictionary with video id as key and label as value
# @json_data --> json data that we have stored in train_annotation_data or validation_annotation_data
def get_label(json_data):
  """Map each video id to its template text with the square brackets removed.

  @json_data --> iterable of annotation dicts, each providing 'id' and 'template'
  Returns a dict video_id --> label; for a repeated id the last entry wins.
  """
  return {
      entry['id']: entry['template'].replace('[', '').replace(']', '')
      for entry in json_data
  }

# For custom dataloader a dictionary is needed with index as key and value is corresponding video
# so that in dataloader corresponding video can be returned for that index


# creating a dic with index as key and (id,label,path) as value
# @folder_path --> Path to folder to training or validation videos
# @json_data --> json data that we have stored in train_annotation_data or validation_annotation_data
def create_index(folder_path,json_data):
  """Build the index --> [video_id, label, path] lookup used by the dataset.

  @folder_path --> folder that contains one sub-folder per class, each holding video files
  @json_data --> annotation list, as accepted by get_label
  Returns a dict whose keys are consecutive integers starting at 0.
  Raises KeyError when a video file's id is missing from json_data.
  """
  video_label = get_label(json_data)
  index_dic = {}
  for folder in os.listdir(folder_path):
    # os.path.join works whether or not folder_path ends with a separator.
    label_folder_path = os.path.join(folder_path, str(folder))
    for file_name in os.listdir(label_folder_path):
      # Strip the extension whatever its length, rather than assuming the
      # five characters of ".webm".
      video_id = os.path.splitext(file_name)[0]
      index_dic[len(index_dic)] = [
          video_id,
          video_label[video_id],
          os.path.join(label_folder_path, file_name),
      ]
  return index_dic

"""### Defining Class for compostion of Transformations on Video"""

# takes input as transformations(defined below) and calls the functions to perform them on video
class ComposeMixDefined(object):
  """Apply a list of [transform, kind] pairs to a list of frames.

  kind "img": the transform is applied to every frame separately. Each frame
              is handed to it as a float tensor in (channels, height, width)
              order and written back in (height, width, channels) order.
  kind "vid": the transform receives the whole frame list and its return
              value replaces it.
  """

  def __init__(self, transforms):
    # transforms --> sequence of [callable, "img" | "vid"] entries
    self.transforms = transforms

  def __call__(self, imgs):
    """Run all transforms in order and return the resulting frames.

    Raises ValueError when an entry's kind is neither "img" nor "vid".
    """
    for entry in self.transforms:
      transform, kind = entry[0], entry[1]
      if kind == "img":
        for idx, img in enumerate(imgs):
          # Move the channel axis with transpose. A plain reshape to
          # (3, H, W) keeps the memory order and therefore mixes pixels of
          # different channels instead of separating them.
          chw = np.ascontiguousarray(np.asarray(img).transpose(2, 0, 1))
          result = transform(torch.from_numpy(chw).float()).numpy()
          imgs[idx] = np.ascontiguousarray(result.transpose(1, 2, 0))
      elif kind == "vid":
        imgs = transform(imgs)
      else:
        raise ValueError("Please specify the transform type")
    return imgs

"""### Custom dataset for Reading videos (Using OpenCV Library)"""

!git clone https://github.com/TwentyBN/smth-smth-v2-baseline-with-models.git #Clone in google drive

cd smth-smth-v2-baseline-with-models/

from transforms_video import *
from data_augmentor import *
from torch.utils.data import Dataset, DataLoader

"""Data Loader"""

# dataloader for video dataset
# fixed no. of frames are only possible so either some frames are removed or some are added

class VideoDataset(Dataset):
  """Dataset that decodes one video per index with OpenCV.

  Each item is a (frames, label) pair: frames is a float tensor laid out as
  (3, num_frames, frame_height, frame_width) and label is a scalar tensor
  holding the remapped class id.
  """

  def __init__(self, folder_path, json_data, labels, frame_rate, is_val,transforms,class_label_update,max_frames = -1,frame_height =240,frame_width=320):
    ### parameters ####
    # @folder_path --> Path to folder of training or validation videos (stored as base_path)
    # @json_data -->  json data that we have stored in train_annotation_data or validation_annotation_data
    # @labels --> labels dictionary defined above
    # @frame_rate --> step, in seconds, between two sampled frames
    # @is_val --> Boolean: when True the frame window always starts at frame 0,
    #             otherwise a random start offset is drawn
    # @transforms --> callable applied to the list of decoded frames
    # @class_label_update --> dictionary mapping the value found in labels to the class id that is returned
    # @max_frames --> number of frames every item is cut or padded to (-1 keeps the decoded length)
    # @frame_height --> Height of frame(default = 240)
    # @frame_width --> width of frame(default = 320)

    self.base_path = folder_path
    self.json_data = json_data
    self.labels = labels
    self.frame_rate = frame_rate
    self.max_frames = max_frames
    # Index dictionary: index --> [id, label, path]
    self.indexes = create_index(self.base_path, self.json_data)
    self.is_val = is_val
    self.frame_height = frame_height
    self.frame_width = frame_width
    self.transforms = transforms
    self.class_label_update = class_label_update
    # self.augmentor = Augmentor(augmentation_mappings_json,augmentation_types_todo)

  def __len__(self):
    # create_index numbers the entries 0..n-1, so the dictionary size is the
    # dataset length; unlike max(keys)+1 this also works for an empty index.
    return len(self.indexes)

  def getFrame(self,cap,sec,img):
    """Seek to `sec` seconds, read one frame, resize it and append it to img.

    Returns the success flag reported by cap.read(); nothing is appended when
    the read fails.
    """
    cap.set(cv2.CAP_PROP_POS_MSEC,sec*1000)
    ret, frame = cap.read()
    if ret:
      frame = cv2.resize(frame, (self.frame_width, self.frame_height)) # resizing the frame to a specific height & width
      img.append(frame)
    return ret

  def __getitem__(self, idx):
    """Return (frames, label) for the video stored at index idx.

    If decoding or preprocessing fails and max_frames is set, an all-zero clip
    of shape (3, max_frames, frame_height, frame_width) is returned instead;
    without max_frames the size of such a clip is unknown, so the error is
    re-raised. A label that cannot be looked up raises KeyError.
    """
    entry = self.indexes[idx]
    label = torch.tensor(self.class_label_update[self.labels[entry[1]]])

    img = [] # has all the frames of the video
    cap =  cv2.VideoCapture(entry[2])

    ## tried using this --> didn't work
    # cap.set(cv2.CAP_PROP_FPS, self.frame_rate)
    # cap.set(cv2.CAP_PROP_FRAME_WIDTH, self.frame_width)
    # cap.set(cv2.CAP_PROP_FRAME_HEIGHT, self.frame_height)
    # cap.set(cv2.CAP_PROP_FRAME_COUNT, self.max_frames)
    try:
      try:
        # Sample one frame every frame_rate seconds until a read fails.
        sec = 0
        ret = self.getFrame(cap,sec,img)
        while ret:
          sec = round(sec + self.frame_rate,2)
          ret = self.getFrame(cap,sec,img)
      finally:
        # Release the capture even when decoding raised.
        cap.release()

      img = self.transforms(img)
      # img, label = self.augmentor(img, int(self.labels[self.indexes[idx][1]]))
      num_frames = len(img)
      if self.max_frames > -1:
        num_frames_necessary = self.max_frames
      else:
        num_frames_necessary = num_frames
      offset = 0

      # more frames decoded than required: keep a contiguous window
      if num_frames_necessary < num_frames:
        diff = (num_frames - num_frames_necessary)
        if not self.is_val:
          # randint's upper bound is exclusive, so offset lies in [0, diff)
          offset = np.random.randint(0, diff)

        img = img[offset: num_frames_necessary + offset: 1]

      # fewer frames decoded than required: repeat the last frame
      # (an empty frame list raises IndexError here and takes the fallback below)
      if num_frames < num_frames_necessary:
        img.extend([img[-1]] *(num_frames_necessary-num_frames ))

      data = torch.from_numpy(np.asarray(img)).float()
      # (frames, height, width, channels) --> (channels, frames, height, width)
      data = data.permute(3,0,1,2)
      return (data,label)
    except Exception:
      # Best-effort fallback for unreadable videos. `Exception` (instead of a
      # bare except) lets KeyboardInterrupt / SystemExit propagate.
      if self.max_frames < 0:
        raise
      return (torch.zeros(3,self.max_frames,self.frame_height,self.frame_width),label)

"""## Models

#### Model1 (VGG style 3D CNN )
This model implements 3D convolution over frames of a video
"""

class Model(nn.Module):
    """Small VGG-style 3D CNN that outputs 20 values per input clip.

    Two convolutional blocks are followed by an average over the last three
    axes of the feature map and a linear layer mapping 128 features to 20.
    `column_units` is accepted by the constructor but is not used.
    """

    def __init__(self, column_units):
        super().__init__()

        first_conv = nn.Conv3d(3, 32, kernel_size=(3, 5, 5), stride=(1, 2, 2), padding=(1, 2, 2))
        self.block1 = nn.Sequential(first_conv, nn.ReLU(inplace=True))

        self.block2 = nn.Sequential(
            nn.Conv3d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv3d(64, 128, kernel_size=3, stride=(1, 2, 2), padding=1),
            nn.ReLU(inplace=True),
        )

        self.fc1 = nn.Linear(128, 20)

    def forward(self, x):
        # convolutional feature extraction
        features = self.block2(self.block1(x))

        # collapse the three trailing axes, one mean at a time
        for _ in range(3):
            features = features.mean(-1)

        # raw scores; no softmax is applied here
        return self.fc1(features)

"""## Classifiers on the encoded video representation

## Neural Network
"""

class Decoder(nn.Module):
  """Two-layer fully connected classifier: linear -> ReLU -> linear -> sigmoid."""

  def __init__(self, in_features=None, hidden_features=None, out_features=None):
    """Create the two linear layers.

    Any size left as None falls back to the module-level setting of the same
    role (latent_dim, hidden_dim, output_dim). Passing the sizes explicitly
    makes the class usable when those module-level names are not defined.
    """
    super().__init__()

    if in_features is None:
      in_features = latent_dim
    if hidden_features is None:
      hidden_features = hidden_dim
    if out_features is None:
      out_features = output_dim

    self.hidden = nn.Linear(in_features, hidden_features)
    self.output_layer = nn.Linear(hidden_features, out_features)

  def forward(self, z):
    """Return sigmoid activations, one per output unit, for input z."""
    hidden_output = torch.relu(self.hidden(z))          # decoder hidden output
    o = self.output_layer(hidden_output)
    output = torch.sigmoid(o)                           # final output

    return output

"""## SVM Classifier"""

def svmClassifier(vector, label):
  """Fit a support vector classifier with default settings and return it.

  vector --> training feature vectors
  label  --> training labels, one per feature vector
  """
  # The name `svm` is never imported in this file, so the class is imported
  # here directly.
  from sklearn.svm import SVC

  svm_classifier = SVC()
  svm_classifier.fit(vector, label)

  return svm_classifier

"""## K-NN classifier"""

def knnClassifier(vector, label):
  """Fit a 3-nearest-neighbour classifier and return it.

  vector --> training feature vectors
  label  --> training labels, one per feature vector
  """
  knn_classifier = KNeighborsClassifier(3)
  knn_classifier.fit(vector, label)

  # Hand the fitted model back to the caller; it used to be discarded.
  return knn_classifier

"""## t-SNE plot"""

def tSNEplot(data, labels, model):
  """Scatter-plot a 2-D t-SNE embedding of the third output of model.encoder(data).

  data   --> input passed to model.encoder
  labels --> one label per sample, used to colour the points
  model  --> object whose encoder(data) returns three values
  NOTE(review): the Model class in this file defines no `encoder` attribute —
  confirm which model this function is meant for.
  """
  # Imported locally: the file imports seaborn only under the alias `sn` and
  # never imports TSNE.
  import seaborn as sns
  from sklearn.manifold import TSNE

  sns.set(rc={'figure.figsize':(11.0,10.0)})
  colors = sns.color_palette("bright", 10)
  encoded_data_mean, encoded_data_var, reconstructed_input = model.encoder(data)

  tsne_embedding = TSNE(n_components=2, init='pca', random_state=0)
  tsne_img = tsne_embedding.fit_transform(reconstructed_input.data.cpu())

  # x / y are passed by keyword; recent seaborn releases reject them positionally.
  sns.scatterplot(x=tsne_img[:,0], y=tsne_img[:,1], hue=labels, legend='full', palette=colors)

# NOTE(review): `test_loader` is not assigned anywhere in this file, and
# `train_loader` / `model` are first assigned further down in the Main section,
# so these statements cannot run at this position in a top-to-bottom execution.
# They also read `.dataset.data` and `.dataset.targets`, which the VideoDataset
# class in this file does not define — presumably left over from another notebook.
image = test_loader.dataset.data/255.
label = test_loader.dataset.targets
tSNEplot(image.to(device).float(), label, model)

image = train_loader.dataset.data/255.
label = train_loader.dataset.targets
tSNEplot(image.to(device).float(), label, model)

"""## Train"""

def train(model, train_loader,valid_loader, optimizer, num_epochs=3):
  """Train `model` with cross-entropy loss and report a per-epoch loss.

  model        --> network returning raw class scores (logits)
  train_loader --> iterable of (image, label) training batches
  valid_loader --> iterable of (image, label) batches; only its first batch is
                   evaluated after each epoch
  optimizer    --> optimizer stepping the model parameters
  num_epochs   --> number of passes over train_loader (default 3)

  Returns (epoch, train_losses, test_losses): the 1-based epoch numbers, the
  mean training loss per epoch, and the scaled validation-batch loss per epoch.
  """
  epoch = []
  train_losses = []
  test_losses = []

  criterion = nn.CrossEntropyLoss().to(device)

  for i in range(num_epochs):

    model.train()
    train_loss = 0

    for j, (image, label) in enumerate(train_loader):

      model.zero_grad()
      image = image.to(device)
      label = label.to(device)
      # CrossEntropyLoss applies log-softmax itself, so it must be given the
      # raw logits; an extra Softmax here flattens the gradients.
      output = model(image.float())
      loss = criterion(output, label)

      loss.backward()
      optimizer.step()
      train_loss += loss.item()

      # drop the batch references before the next batch is loaded
      del output
      del label
      del image

    epoch.append(i+1)
    train_losses.append(train_loss/len(train_loader))
    print("Epoch "+str(i+1) + " Loss "+str(train_loss/len(train_loader)))

    # Validation on a single batch, without tracking gradients.
    model.eval()
    with torch.no_grad():
      test_data = next(iter(valid_loader))
      test_loss = criterion(model(test_data[0].to(device).float()), test_data[1].to(device))
    model.train()
    # NOTE(review): 195 is a hard-coded divisor whose origin is not shown in
    # this file; it is kept so reported values stay on the same scale.
    test_losses.append(test_loss.item()/195)
    del test_data
    print('Epoch '+str(i+1)+' Loss'+ str(test_loss.item()/195))

  return epoch, train_losses, test_losses

"""## Test Loss"""

def test(model, optimizer, test_iterator, total):
  """Return the summed cross-entropy loss of the first 50 batches divided by `total`.

  model         --> network returning raw class scores
  optimizer     --> unused; kept so existing calls keep working
  test_iterator --> iterable of (image, label) batches
  total         --> divisor applied to the summed loss
  """
  model.eval()
  # Built locally so the function does not depend on a module-level `criterion`
  # having been assigned before it is called.
  criterion = nn.CrossEntropyLoss().to(device)
  test_loss = 0

  # Evaluation only: no gradients are needed.
  with torch.no_grad():
    for i, (image, label) in enumerate(test_iterator):

      # only the first 50 batches are evaluated
      if i == 50:
        break

      image = image.to(device)
      output = model(image.float())
      label = label.to(device)
      loss = criterion(output, label)
      test_loss += loss.item()

  return test_loss/total

"""## Accuracy"""

def accuracy(model, test_loader, optimizer, total):
  """Return the top-1 accuracy, in percent, of `model` over `test_loader`.

  model       --> network returning one score per class
  test_loader --> iterable of (image, label) batches
  optimizer   --> unused; kept so existing calls keep working
  total       --> unused; kept so existing calls keep working

  The percentage is computed over the number of samples actually seen instead
  of a fixed constant, so it is correct for any loader size. An empty loader
  yields 0.0.
  """
  model.eval()
  correct = 0
  seen = 0

  with torch.no_grad():
    for i, (image, label) in enumerate(test_loader):

      image = image.to(device)
      output = model(image.float())
      label = label.cpu().detach().numpy()
      predicted = np.argmax(output.cpu().detach().numpy(),axis = 1)
      correct += (predicted == label).sum()
      seen += len(label)

  if seen == 0:
    return 0.0
  return 100 * correct / seen

"""## Plot"""

def plot(x, y):
  """Draw y against x as a line plot and display the figure."""
  plt.plot(x, y)
  plt.show()

"""## Main"""

# Augmentation pipeline. Note that it is not passed to either dataset below:
# both are built with an empty ComposeMixDefined([]).
transforms = ComposeMixDefined([
        [torchvision.transforms.Normalize(mean = [127.5,127.5,127.5], std = [127.5,127.5,127.5]), "img" ],         
        [RandomRotationVideo(20), "vid"],
        [RandomHorizontalFlipVideo(0.5), "vid"],
        [RandomReverseTimeVideo(),"vid"],
        [RandomCropVideo(84), "vid"]
          ])

# Training set: one frame every 0.2 s, 30 frames per clip, 84x84 pixels.
train_vd =  VideoDataset( folder_path = '/content/drive/My Drive/20 classes training videos/', 
                  json_data = train_annotations_data, 
                  labels = labels, 
                  frame_rate = 0.2, 
                  is_val = False,
                  transforms = ComposeMixDefined([]),
                  class_label_update = class_label_update,
                  max_frames = 30,
                  frame_height = 84,
                  frame_width = 84)

train_loader = DataLoader(train_vd,batch_size=1, shuffle=True, num_workers=1)

# Validation set: is_val=True makes the frame window start at frame 0 instead
# of at a random offset, so validation clips are the same on every pass.
valid_vd = VideoDataset(folder_path ='/content/drive/My Drive/20 classes validation videos/', 
                        json_data = validation_annotations_data, 
                        labels = labels, 
                        frame_rate = 0.2, 
                        is_val = True,
                        transforms =  ComposeMixDefined([]),
                        class_label_update = class_label_update,
                        max_frames = 30,
                        frame_height = 84,
                        frame_width = 84)

valid_loader = DataLoader(valid_vd,batch_size=1, shuffle=True,num_workers=1)

model = Model(512)
model = model.cuda()

optimizer = optim.Adam(model.parameters(), lr=0.001) 
# train() returns exactly three lists. Unpacking four raised ValueError and
# would also have rebound the name `accuracy`, hiding the accuracy() function
# that is called below.
epochs, training_losses, test_losses = train(model, train_loader, valid_loader,optimizer)

print(training_losses, test_losses)

torch.save(model.state_dict(), "/content/drive/My Drive/3D_CNN_Project.pt")

# NOTE(review): weights are restored from '3D_CNN.pt', not from the
# '3D_CNN_Project.pt' file written just above — confirm this is intended.
model.load_state_dict(torch.load('/content/drive/My Drive/3D_CNN.pt'))

model.eval()

# The former `model = torch.load(...)` line was dropped: the same file is loaded
# as a state dict just above, so that assignment replaced the network with a
# plain dict and the forward pass below could not run.

criterion = nn.CrossEntropyLoss().to(device)
test_data = next(iter(valid_loader))
test_loss = criterion(model(test_data[0].to(device).float()), test_data[1].to(device))
# test_losses.append(test_loss.item())
# print('Epoch '+str(i+1)+' Loss'+ str(test_loss.item()))

print(test_loss/195)

a = accuracy(model, train_loader, optimizer, 32)
print(a)

torch.save(model.state_dict(), "3D_CNN.pt")

accuracy(model, train_loader, optimizer, 32)

accuracy(model, valid_loader, optimizer, 32)

# (class id, number of training videos) for each of the 174 class folders.
class_freq = [
    (i, len(os.listdir('/content/drive/My Drive/Deep_Learning_Project_Data/Videos/' +str(i))))
    for i in range(174)
]
print(class_freq)
# Ascending by video count, so the first 20 entries are the smallest classes.
class_freq.sort(key = lambda x: x[1])
print(class_freq)
# The totals get their own names so the builtin `sum` is not shadowed.
train_video_total = sum(freq for _, freq in class_freq[:20])
print(train_video_total)

# Validation video counts for the same 20 classes.
valid_freq = [
    (class_id, len(os.listdir('/content/drive/My Drive/Deep_Learning_Project_Data/Valid_Videos/' +str(class_id))))
    for class_id, _ in class_freq[:20]
]
print(valid_freq)
valid_video_total = sum(freq for _, freq in valid_freq)
print(valid_video_total)

"""#Feature Extraction"""

def _extract_features(loader):
  """Run `model` over every item of `loader` and collect outputs and labels.

  Each output is flattened with view(20) and each label is read with .item(),
  so the loader must yield one sample per batch.
  Returns (features, labels) as numpy arrays.
  """
  features = []
  targets = []
  # Inference only: skip building autograd graphs.
  with torch.no_grad():
    for videos, label in loader:
      res = model(videos.cuda().float())
      features.append(res.view(20).detach().cpu().numpy())
      targets.append(label.item())
  return np.array(features), np.array(targets)

train_features, train_label = _extract_features(train_loader)

test_features, test_label = _extract_features(valid_loader)

"""# Evaluation Metrics"""

from sklearn.metrics import classification_report

def classification_report_gen(y_true, y_pred):
  """Print the classification report computed from true and predicted labels."""
  report = classification_report(y_true, y_pred)
  print(report)

import pandas as pd
import seaborn as sn

def confusion_matrix(y_true,y_pred):
  """Draw a 20x20 confusion-matrix heatmap (rows: true class, columns: predicted).

  y_true --> sequence of true class ids, used as row indices
  y_pred --> sequence of predicted class ids, used as column indices
  Axis names are taken from the module-level `labels` and `class_label_update`.
  """
  counts = [[0] * 20 for _ in range(20)]
  for position, actual in enumerate(y_true):
    counts[actual][y_pred[position]] += 1

  # For every class key, take the first label name whose value equals that key;
  # keys without a matching label contribute no name.
  names = []
  for class_key in list(class_label_update.keys()):
    matches = [label_name for label_name in labels if labels[label_name] == class_key]
    if matches:
      names.append(matches[0])

  df_confusion_matrix = pd.DataFrame(counts, index=names, columns=names)
  sn.heatmap(df_confusion_matrix, cmap='Oranges', annot=True,cbar=False,fmt='d')

def top_5_accuracy(y_true, y_pred_score):
  """Return the percentage of samples whose true class is among the 5 highest scores.

  y_true       --> sequence of true class indices
  y_pred_score --> one score vector per sample; position k holds the score of class k
  Returns 0.0 for empty input instead of dividing by zero.
  """
  sample_count = len(y_true)
  if sample_count == 0:
    return 0.0

  correct = 0
  for target, scores in zip(y_true, y_pred_score):
    # indices of the five largest scores
    top_5 = np.asarray(scores).argsort()[-5:]
    if target in top_5:
      correct += 1
  return (correct/sample_count)*100

def accuracy_model(model, test_features, test_label):
  """Print the score `model.score` reports for the given test data.

  model         --> fitted estimator exposing score(features, labels)
  test_features --> test feature vectors
  test_label    --> true labels for test_features
  """
  # Print the score exactly as computed; the previous `+ 0.1` overstated it.
  print(model.score(test_features, test_label))

"""#TSNE Plot"""

def tSNEplot(data, labels, model):
  """Scatter-plot a 2-D t-SNE embedding of `data`, coloured by `labels`.

  data   --> feature matrix, one row per sample
  labels --> one label per row of data
  model  --> unused; kept so existing calls keep working
  """
  # Imported locally: the file imports seaborn only under the alias `sn` and
  # never imports TSNE.
  import seaborn as sns
  from sklearn.manifold import TSNE

  sns.set(rc={'figure.figsize':(11.0,10.0)})
  colors = sns.color_palette("bright", 20)

  tsne_embedding = TSNE(n_components=2, init='pca', random_state=0)
  tsne_img = tsne_embedding.fit_transform(data)

  # x / y are passed by keyword; recent seaborn releases reject them positionally.
  sns.scatterplot(x=tsne_img[:,0], y=tsne_img[:,1], hue=labels, legend='full', palette=colors)

# t-SNE plots of the extracted training and validation features.
tSNEplot(train_features,train_label, model)
tSNEplot(test_features,test_label,model)

"""#SVM Classifier"""

# probability=True is required for the predict_proba call used further down.
svm_classifier = SVC(probability=True)      
svm_classifier.fit(train_features,train_label)

# Prints the classifier's score on the validation features.
accuracy_model(svm_classifier, test_features, test_label)

y_pred = svm_classifier.predict(test_features)

classification_report_gen(test_label, y_pred)

confusion_matrix(test_label, y_pred)

# The returned value is neither printed nor stored here; it is only visible
# when the notebook displays the last expression of a cell.
top_5_accuracy(test_label, svm_classifier.predict_proba(test_features))

"""# KNN Classifier"""

# Same evaluation sequence as above, with a 3-nearest-neighbour classifier.
knn_classifier = KNeighborsClassifier(3)      
knn_classifier.fit(train_features,train_label)

accuracy_model(knn_classifier, test_features, test_label)

y_pred = knn_classifier.predict(test_features)

classification_report_gen(test_label, y_pred)

confusion_matrix(test_label, y_pred)

# As above, the returned value is neither printed nor stored.
top_5_accuracy(test_label, knn_classifier.predict_proba(test_features))