The UCF101 dataset can be downloaded from: https://www.crcv.ucf.edu/data/UCF101.php
1. Inference and Project Configuration
Running inference.py requires three arguments: first, the path of the label file, which stores the meaning of each class label; second, the path of the weight file; and third, the path of the video file to run inference on.
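As a rough sketch of what inference.py does with those three arguments (this is not the project's script verbatim; the module path network.C3D_model and the checkpoint key 'state_dict' are assumptions):

import sys
import cv2
import numpy as np
import torch
from network.C3D_model import C3D  # assumed module path

label_file, weight_file, video_file = sys.argv[1:4]

# the label file stores one "id name" pair per line
with open(label_file) as f:
    class_names = [line.strip().split(' ', 1)[1] for line in f if line.strip()]

model = C3D(num_classes=len(class_names), pretrained=False)
checkpoint = torch.load(weight_file, map_location='cpu')
model.load_state_dict(checkpoint['state_dict'])  # checkpoint layout is an assumption
model.eval()

# collect a 16-frame clip: resize to 128x171, subtract the channel means,
# then take the center 112x112 crop (mirrors the preprocessing described below)
cap, clip = cv2.VideoCapture(video_file), []
while len(clip) < 16:
    ret, frame = cap.read()
    if not ret:
        break
    frame = cv2.resize(frame, (171, 128)).astype(np.float32)
    frame -= np.array([[[90.0, 98.0, 102.0]]], dtype=np.float32)
    clip.append(frame[8:120, 30:142, :])
cap.release()

# (T, H, W, C) -> (1, C, T, H, W)
inputs = torch.from_numpy(np.array(clip).transpose(3, 0, 1, 2)).unsqueeze(0)
with torch.no_grad():
    probs = torch.softmax(model(inputs), dim=1)
print(class_names[int(probs.argmax(dim=1))])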
2. Training Configuration Parameters
Two parameters in mypath.py need to be modified: root_dir is the directory where the UCF-101 dataset is stored, and output_dir is the directory where intermediate data is saved. During preprocessing, each video is cut into short frame sequences and augmented with temporal and spatial jittering.
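A minimal sketch of mypath.py (the directory strings are placeholders to replace with your own paths):

# mypath.py -- central place for dataset and model paths
class Path(object):
    @staticmethod
    def db_dir(database):
        if database == 'ucf101':
            root_dir = '/path/to/UCF-101'        # class sub-folders with the raw .avi files
            output_dir = '/path/to/ucf101_out'   # where the extracted frame folders are written
            return root_dir, output_dir
        elif database == 'hmdb51':
            root_dir = '/path/to/hmdb-51'
            output_dir = '/path/to/hmdb51_out'
            return root_dir, output_dir
        else:
            raise NotImplementedError('Database {} not available.'.format(database))

    @staticmethod
    def model_dir():
        # location of the pretrained C3D weights (used when pretrained=True)
        return '/path/to/c3d-pretrained.pth'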
Training parameters:

Also note that in dataset.py, the data must be preprocessed the first time the model is run, so preprocess=True is required on the first execution. In later runs the intermediate results have already been saved, so preprocess=False can be used.
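For example (a sketch; the dataloaders.dataset module path matches the label-file paths used in the code below):

from torch.utils.data import DataLoader
from dataloaders.dataset import VideoDataset

# first run: cut the raw videos into frame folders (slow, done only once)
train_data = VideoDataset(dataset='ucf101', split='train', clip_len=16, preprocess=True)
# later runs: the frame folders already exist, so skip preprocessing
# train_data = VideoDataset(dataset='ucf101', split='train', clip_len=16, preprocess=False)

train_loader = DataLoader(train_data, batch_size=16, shuffle=True, num_workers=4)
clips, labels = next(iter(train_loader))
print(clips.shape)   # torch.Size([16, 3, 16, 112, 112])
print(labels.shape)  # torch.Size([16])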
3. Video Data Loading
(1) Video preprocessing
In the UCF101 dataset, each folder contains all videos of one action class. For each class folder, the videos are split into training, validation, and test sets (80/20 into train+val and test, then the train+val part is split 80/20 again, giving roughly 64% train, 16% val, 20% test), and each video is cut into a sequence of frame images. The procedure is:
- Read the video and obtain its frame count, frame width, and frame height.
- Adjacent frames are very similar, so not every frame carries useful information; one frame is therefore sampled every 4 frames. However, the input sequence must contain at least 16 frames, so for videos whose sampling would yield fewer than 16 frames, the interval is lowered (see the helper sketch after this list).
- Resize each sampled frame to the specified size.
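The interval-lowering rule can be written as a small standalone helper (a sketch equivalent to the EXTRACT_FREQUENCY logic in process_video below):

def sampling_interval(frame_count, min_frames=16, start=4):
    """Largest interval in {4, 3, 2, 1} that still yields more than
    min_frames sampled frames."""
    interval = start
    while interval > 1 and frame_count // interval <= min_frames:
        interval -= 1
    return interval

# a 180-frame video keeps every 4th frame (45 frames),
# while a 50-frame video drops to every 2nd frame (25 frames)
assert sampling_interval(180) == 4
assert sampling_interval(50) == 2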
(2) Batch construction
First, the frame sequence produced by preprocessing is loaded into a buffer. The buffer is then cropped temporally and spatially, so that the final clip consists of 16 frames of size 112×112. Finally, the clip is normalized, converted to tensor format, and paired with its label.
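Concretely, the temporal and spatial jitter is just three random start indices and one slice, as this standalone toy example shows:

import numpy as np

buffer = np.random.rand(40, 128, 171, 3).astype(np.float32)  # 40 frames of 128x171

t = np.random.randint(40 - 16)    # temporal jitter: first frame of the clip
h = np.random.randint(128 - 112)  # spatial jitter: top edge of the crop
w = np.random.randint(171 - 112)  # spatial jitter: left edge of the crop
clip = buffer[t:t + 16, h:h + 112, w:w + 112, :]

print(clip.shape)                        # (16, 112, 112, 3)
print(clip.transpose(3, 0, 1, 2).shape)  # (3, 16, 112, 112)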
The full code is as follows:
import os

import cv2
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset

from mypath import Path


class VideoDataset(Dataset):
    r"""A Dataset for a folder of videos. Expects the directory structure to be
    directory->[train/val/test]->[class labels]->[videos]. Initializes with a list
    of all file names, along with an array of labels, with label being automatically
    inferred from the respective folder names.

    Args:
        dataset (str): Name of dataset. Defaults to 'ucf101'.
        split (str): Determines which folder of the directory the dataset will read from. Defaults to 'train'.
        clip_len (int): Determines how many frames are there in each clip. Defaults to 16.
        preprocess (bool): Determines whether to preprocess dataset. Defaults to True here,
            because the data must be preprocessed on the first run.
    """

    def __init__(self, dataset='ucf101', split='train', clip_len=16, preprocess=True):
        # raw data path / path where the preprocessing results are stored
        self.root_dir, self.output_dir = Path.db_dir(dataset)
        # train/val/test folder
        folder = os.path.join(self.output_dir, split)
        # number of frames per clip
        self.clip_len = clip_len
        self.split = split  # train/val/test

        # The following three parameters are chosen as described in the paper section 4.1
        self.resize_height = 128
        self.resize_width = 171
        self.crop_size = 112

        if not self.check_integrity():
            raise RuntimeError('Dataset not found or corrupted.'
                               ' You need to download it from the official website.')

        # video preprocessing
        if (not self.check_preprocess()) or preprocess:
            print('Preprocessing of {} dataset, this will take long, but it will be done only once.'.format(dataset))
            self.preprocess()

        # Obtain all the filenames of files inside all the class folders,
        # going through each class folder one at a time
        self.fnames, labels = [], []
        for label in sorted(os.listdir(folder)):
            for fname in os.listdir(os.path.join(folder, label)):
                self.fnames.append(os.path.join(folder, label, fname))
                labels.append(label)
        assert len(labels) == len(self.fnames)
        print('Number of {} videos: {:d}'.format(split, len(self.fnames)))

        # Prepare a mapping between the label names (strings) and indices (ints)
        self.label2index = {label: index for index, label in enumerate(sorted(set(labels)))}
        # Convert the list of label names into an array of label indices
        self.label_array = np.array([self.label2index[label] for label in labels], dtype=int)

        if dataset == 'ucf101':
            if not os.path.exists('dataloaders/ucf_labels.txt'):
                with open('dataloaders/ucf_labels.txt', 'w') as f:
                    for id, label in enumerate(sorted(self.label2index)):
                        f.writelines(str(id + 1) + ' ' + label + '\n')
        elif dataset == 'hmdb51':
            if not os.path.exists('dataloaders/hmdb_labels.txt'):
                with open('dataloaders/hmdb_labels.txt', 'w') as f:
                    for id, label in enumerate(sorted(self.label2index)):
                        f.writelines(str(id + 1) + ' ' + label + '\n')

    def __len__(self):
        return len(self.fnames)

    # __getitem__ must be overridden for a Dataset
    def __getitem__(self, index):
        # Loading and preprocessing: read the frame images of one clip into a buffer
        # (there are 8,460 clip folders in total)
        buffer = self.load_frames(self.fnames[index])
        # crop the clip temporally and spatially; the result is a (16, 112, 112, 3) array
        buffer = self.crop(buffer, self.clip_len, self.crop_size)
        labels = np.array(self.label_array[index])

        if self.split == 'test':
            # Perform data augmentation
            buffer = self.randomflip(buffer)
        buffer = self.normalize(buffer)  # subtract the channel means
        buffer = self.to_tensor(buffer)  # (T, H, W, C) -> (C, T, H, W)
        return torch.from_numpy(buffer), torch.from_numpy(labels)

    def check_integrity(self):
        return os.path.exists(self.root_dir)

    def check_preprocess(self):
        # TODO: Check image size in output_dir
        if not os.path.exists(self.output_dir):
            return False
        elif not os.path.exists(os.path.join(self.output_dir, 'train')):
            return False

        for ii, video_class in enumerate(os.listdir(os.path.join(self.output_dir, 'train'))):
            for video in os.listdir(os.path.join(self.output_dir, 'train', video_class)):
                video_name = os.path.join(os.path.join(self.output_dir, 'train', video_class, video),
                                          sorted(os.listdir(os.path.join(self.output_dir, 'train', video_class, video)))[0])
                image = cv2.imread(video_name)
                if np.shape(image)[0] != 128 or np.shape(image)[1] != 171:
                    return False
                else:
                    break
            if ii == 10:
                break
        return True

    # video preprocessing
    def preprocess(self):
        # create the output folders if they do not exist
        if not os.path.exists(self.output_dir):
            os.mkdir(self.output_dir)
            os.mkdir(os.path.join(self.output_dir, 'train'))
            os.mkdir(os.path.join(self.output_dir, 'val'))
            os.mkdir(os.path.join(self.output_dir, 'test'))

        # Split train/val/test sets:
        # for each class folder, split its videos into train, val and test
        for file in os.listdir(self.root_dir):
            file_path = os.path.join(self.root_dir, file)
            video_files = [name for name in os.listdir(file_path)]

            train_and_valid, test = train_test_split(video_files, test_size=0.2, random_state=42)
            train, val = train_test_split(train_and_valid, test_size=0.2, random_state=42)

            train_dir = os.path.join(self.output_dir, 'train', file)
            val_dir = os.path.join(self.output_dir, 'val', file)
            test_dir = os.path.join(self.output_dir, 'test', file)

            if not os.path.exists(train_dir):
                os.mkdir(train_dir)
            if not os.path.exists(val_dir):
                os.mkdir(val_dir)
            if not os.path.exists(test_dir):
                os.mkdir(test_dir)

            # cut every video of each split into a frame sequence,
            # keeping one frame out of every 4 by default
            for video in train:
                self.process_video(video, file, train_dir)
            for video in val:
                self.process_video(video, file, val_dir)
            for video in test:
                self.process_video(video, file, test_dir)

        print('Preprocessing finished.')

    # cut a single video into a sequence of frame images
    def process_video(self, video, action_name, save_dir):
        # Initialize a VideoCapture object to read video data into a numpy array
        video_filename = video.split('.')[0]
        if not os.path.exists(os.path.join(save_dir, video_filename)):
            os.mkdir(os.path.join(save_dir, video_filename))

        # open the video and query its frame count, width and height
        capture = cv2.VideoCapture(os.path.join(self.root_dir, action_name, video))

        frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
        frame_width = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # Make sure the split video has at least 16 frames:
        # adjacent frames are similar, so only one frame out of every 4 is kept;
        # if that would yield fewer than 16 frames, lower the sampling interval
        EXTRACT_FREQUENCY = 4
        if frame_count // EXTRACT_FREQUENCY <= 16:
            EXTRACT_FREQUENCY -= 1
            if frame_count // EXTRACT_FREQUENCY <= 16:
                EXTRACT_FREQUENCY -= 1
                if frame_count // EXTRACT_FREQUENCY <= 16:
                    EXTRACT_FREQUENCY -= 1

        count = 0
        i = 0
        retaining = True

        while (count < frame_count and retaining):
            retaining, frame = capture.read()
            if frame is None:
                continue

            if count % EXTRACT_FREQUENCY == 0:
                # resize the frame to the target size
                if (frame_height != self.resize_height) or (frame_width != self.resize_width):
                    frame = cv2.resize(frame, (self.resize_width, self.resize_height))
                # zero-pad the index so that sorted() in load_frames returns the frames
                # in temporal order ('0000{}'.format(i) would sort frame 10 before frame 2)
                cv2.imwrite(filename=os.path.join(save_dir, video_filename, '{:05d}.jpg'.format(i)), img=frame)
                i += 1
            count += 1

        # Release the VideoCapture once it is no longer needed
        capture.release()

    def randomflip(self, buffer):
        """Horizontally flip the given clip randomly with a probability of 0.5."""
        if np.random.random() < 0.5:
            for i, frame in enumerate(buffer):
                # flip exactly once (flipping twice, as some versions do, undoes the augmentation)
                buffer[i] = cv2.flip(frame, flipCode=1)
        return buffer

    def normalize(self, buffer):
        for i, frame in enumerate(buffer):
            # subtract the per-channel means (BGR order, as loaded by OpenCV)
            frame -= np.array([[[90.0, 98.0, 102.0]]])
            buffer[i] = frame
        return buffer

    def to_tensor(self, buffer):
        # (T, H, W, C) -> (C, T, H, W)
        return buffer.transpose((3, 0, 1, 2))

    # read every frame image of a clip into a buffer
    def load_frames(self, file_dir):
        frames = sorted([os.path.join(file_dir, img) for img in os.listdir(file_dir)])
        frame_count = len(frames)
        buffer = np.empty((frame_count, self.resize_height, self.resize_width, 3), np.dtype('float32'))
        for i, frame_name in enumerate(frames):
            frame = np.array(cv2.imread(frame_name)).astype(np.float64)
            buffer[i] = frame
        return buffer

    # randomly pick a time index and cut out a clip_len-frame sequence, and crop
    # each frame spatially; the result is a (16, 112, 112, 3) array
    def crop(self, buffer, clip_len, crop_size):
        # randomly select time index for temporal jittering
        time_index = np.random.randint(buffer.shape[0] - clip_len)

        # Randomly select start indices in order to crop the video
        height_index = np.random.randint(buffer.shape[1] - crop_size)
        width_index = np.random.randint(buffer.shape[2] - crop_size)

        # Crop and jitter the video using indexing. The spatial crop is performed on
        # the entire array, so each frame is cropped in the same location. The temporal
        # jitter takes place via the selection of consecutive frames
        buffer = buffer[time_index:time_index + clip_len,
                        height_index:height_index + crop_size,
                        width_index:width_index + crop_size, :]

        return buffer
4. Network Architecture
The network structure matches the paper. The input size is 3×16×128×171; during training, random crops of size 3×16×112×112 provide spatial and temporal jittering. The network has 5 convolution blocks, each immediately followed by a pooling layer, plus 2 fully connected layers and a softmax layer that predicts the action label. The filter counts of the 5 convolution blocks, from 1 to 5, are 64, 128, 256, 512, and 512; in the final C3D model implemented below, blocks 3, 4, and 5 each consist of two convolution layers (conv3a/3b, conv4a/4b, conv5a/5b). All convolution kernels have size 3×3×3 (temporal depth d = 3) and use stride 1 with appropriate spatial and temporal padding, so the convolutions do not change the signal size. All pooling layers are max pooling with kernel size 2×2×2 and stride 2 (except the first), so each of them reduces the signal size by a factor of 8. The first pooling layer uses kernel size 1×2×2 so as not to merge the temporal signal too early, preserving the 16-frame sequence length through the first block.
The code is as follows:
import torch
import torch.nn as nn

from mypath import Path


class C3D(nn.Module):
    """The C3D network."""

    def __init__(self, num_classes, pretrained=False):
        super(C3D, self).__init__()

        self.conv1 = nn.Conv3d(3, 64, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        # kernel (1, 2, 2): do not merge the temporal signal too early
        self.pool1 = nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2))

        self.conv2 = nn.Conv3d(64, 128, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool2 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        self.conv3a = nn.Conv3d(128, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.conv3b = nn.Conv3d(256, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool3 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        self.conv4a = nn.Conv3d(256, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.conv4b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool4 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        self.conv5a = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.conv5b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool5 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=(0, 1, 1))

        self.fc6 = nn.Linear(8192, 4096)
        self.fc7 = nn.Linear(4096, 4096)
        self.fc8 = nn.Linear(4096, num_classes)

        self.dropout = nn.Dropout(p=0.5)
        self.relu = nn.ReLU()

        self.__init_weight()

        if pretrained:
            self.__load_pretrained_weights()

    def forward(self, x):
        # input: (N, 3, 16, 112, 112)
        x = self.relu(self.conv1(x))
        x = self.pool1(x)                 # (N, 64, 16, 56, 56)

        x = self.relu(self.conv2(x))
        x = self.pool2(x)                 # (N, 128, 8, 28, 28)

        x = self.relu(self.conv3a(x))
        x = self.relu(self.conv3b(x))
        x = self.pool3(x)                 # (N, 256, 4, 14, 14)

        x = self.relu(self.conv4a(x))
        x = self.relu(self.conv4b(x))
        x = self.pool4(x)                 # (N, 512, 2, 7, 7)

        x = self.relu(self.conv5a(x))
        x = self.relu(self.conv5b(x))
        x = self.pool5(x)                 # (N, 512, 1, 4, 4)

        x = x.view(-1, 8192)              # flatten: 512 * 1 * 4 * 4 = 8192
        x = self.relu(self.fc6(x))
        x = self.dropout(x)
        x = self.relu(self.fc7(x))
        x = self.dropout(x)

        logits = self.fc8(x)              # (N, num_classes)
        return logits

    def __load_pretrained_weights(self):
        """Initialize the network from pretrained C3D weights."""
        corresp_name = {
            # Conv1
            "features.0.weight": "conv1.weight",
            "features.0.bias": "conv1.bias",
            # Conv2
            "features.3.weight": "conv2.weight",
            "features.3.bias": "conv2.bias",
            # Conv3a
            "features.6.weight": "conv3a.weight",
            "features.6.bias": "conv3a.bias",
            # Conv3b
            "features.8.weight": "conv3b.weight",
            "features.8.bias": "conv3b.bias",
            # Conv4a
            "features.11.weight": "conv4a.weight",
            "features.11.bias": "conv4a.bias",
            # Conv4b
            "features.13.weight": "conv4b.weight",
            "features.13.bias": "conv4b.bias",
            # Conv5a
            "features.16.weight": "conv5a.weight",
            "features.16.bias": "conv5a.bias",
            # Conv5b
            "features.18.weight": "conv5b.weight",
            "features.18.bias": "conv5b.bias",
            # fc6
            "classifier.0.weight": "fc6.weight",
            "classifier.0.bias": "fc6.bias",
            # fc7
            "classifier.3.weight": "fc7.weight",
            "classifier.3.bias": "fc7.bias",
        }

        p_dict = torch.load(Path.model_dir())
        s_dict = self.state_dict()
        for name in p_dict:
            if name not in corresp_name:
                continue  # fc8 is trained from scratch for the new class count
            s_dict[corresp_name[name]] = p_dict[name]
        self.load_state_dict(s_dict)

    def __init_weight(self):
        for m in self.modules():
            if isinstance(m, nn.Conv3d):
                torch.nn.init.kaiming_normal_(m.weight)
            elif isinstance(m, nn.BatchNorm3d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
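A quick smoke test confirms the flattened feature size of 8192 (512×1×4×4 after pool5) and the output shape:

if __name__ == '__main__':
    inputs = torch.rand(1, 3, 16, 112, 112)
    net = C3D(num_classes=101, pretrained=False)
    outputs = net(inputs)
    print(outputs.size())  # torch.Size([1, 101])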