核心内容摘要
坚韧的翅膀:单亲妈妈的“小马大车”人生路
Pytorch实现Vision Transformerimporttorchimporttorch.nnasnnclassPatchEmbedding(nn.Module):def__init__(self,img_size224,patch_size16,in_channels3,embed_dim
:super().__init__()self.img_sizeimg_size self.patch_sizepatch_size self.n_patches(img_size//patch_size)**2# 使用卷积层实现patch分割和嵌入self.projnn.Conv2d(in_channelsin_channels,out_channelsembed_dim,kernel_sizepatch_size,stridepatch_size)defforward(self,x):# 输入x形状: [batch_size, in_channels, img_size, img_size]# 输出形状: [batch_size, n_patches, embed_dim]xself.proj(x)# [batch_size, embed_dim, n_patches^
5, n_patches^
5]xx.flatten(
# [batch_size, embed_dim, n_patches]xx.transpose(1,
# [batch_size, n_patches, embed_dim]returnxclassPositionEmbedding(nn.Module):def__init__(self,n_patches,embed_dim,dropout
0.
:super().__init__()self.pos_embednn.Parameter(torch.zeros(1,n_patches1,embed_dim))# 1 for class tokenself.dropoutnn.Dropout(dropout)defforward(self,x):# x形状: [batch_size, n_patches1, embed_dim]xxself.pos_embed# 添加位置编码xself.dropout(x)returnxclassMultiHeadAttention(nn.Module):def__init__(self,embed_dim,num_heads,dropout
0.
:super().__init__()self.embed_dimembed_dim self.num_headsnum_heads self.head_dimembed_dim//num_headsassertself.head_dim*num_headsembed_dim,Embedding dimension must be divisible by number of headsself.qkvnn.Linear(embed_dim,embed_dim*
# 同时计算Q,K,Vself.attn_dropoutnn.Dropout(dropout)self.projnn.Linear(embed_dim,embed_dim)self.proj_dropoutnn.Dropout(dropout)self.scaleself.head_dim**-
5defforward(self,x):batch_size,n_patches,embed_dimx.shape# 计算Q,K,V [batch_size, n_patches, num_heads, head_dim]qkvself.qkv(x).reshape(batch_size,n_patches,3,self.num_heads,self.head_dim).permute(2,0,3,1,
q,k,vqkv[0],qkv[1],qkv[2]# 计算注意力分数 [batch_size, num_heads, n_patches, n_patches]attn(q k.transpose(-2,-
)*self.scale attnattn.softmax(dim-
attnself.attn_dropout(attn)# 应用注意力权重到V上 [batch_size, num_heads, n_patches, head_dim]outattn v outout.transpose(1,
.reshape(batch_size,n_patches,embed_dim)# 线性投影和dropoutoutself.proj(out)outself.proj_dropout(out)returnoutclassMLP(nn.Module):def__init__(self,in_features,hidden_features,out_features,dropout
0.
:super().__init__()self.fc1nn.Linear(in_features,hidden_features)self.actnn.GELU()self.fc2nn.Linear(hidden_features,out_features)self.dropoutnn.Dropout(dropout)defforward(self,x):xself.fc1(x)xself.act(x)xself.dropout(x)xself.fc2(x)xself.dropout(x)returnxclassTransformerBlock(nn.Module):def__init__(self,embed_dim,num_heads,mlp_ratio4,dropout
0.
:super().__init__()self.norm1nn.LayerNorm(embed_dim)self.attnMultiHeadAttention(embed_dim,num_heads,dropout)self.norm2nn.LayerNorm(embed_dim)self.mlpMLP(in_featuresembed_dim,hidden_featuresembed_dim*mlp_ratio,out_featuresembed_dim,dropoutdropout)defforward(self,x):# 残差连接和层归一化xxself.attn(self.norm1(x))xxself.mlp(self.norm2(x))returnxclassVisionTransformer(nn.Module):def__init__(self,img_size224,patch_size16,in_channels3,n_classes1000,embed_dim768,depth12,num_heads12,mlp_ratio4,dropout
0.
:super().__init__()self.patch_embedPatchEmbedding(img_size,patch_size,in_channels,embed_dim)n_patchesself.patch_embed.n_patches# 分类token和位置编码self.cls_tokennn.Parameter(torch.zeros(1,1,embed_dim))self.pos_embedPositionEmbedding(n_patches,embed_dim,dropout)# Transformer编码器self.blocksnn.Sequential(*[TransformerBlock(embed_dim,num_heads,mlp_ratio,dropout)for_inrange(depth)])# 分类头self.normnn.LayerNorm(embed_dim)self.headnn.Linear(embed_dim,n_classes)# 初始化权重nn.init.trunc_normal_(self.cls_token,std
0.