UNINEXT
UNINEXT inference code pipeline
VisualizationDemo
self.predictor = UNINEXTPredictor(cfg)
self.model = UNINEXT_IMG(cfg)
transformer = DeformableTransformerVLDINO
self.d_model = 256
self.head = 8
self.two_stage = True
self.two_stage_num_proposals = 900
self.encoder = DeformableTransformerEncoderVL
vl_fusion_layer = VLFuse(cfg)
encoder_layer = DeformableTransformerEncoderLayer
lang_encoder_layer = Identity()
num_encoder_layers = 8
num_vl_layers = 1
self.decoder = DeformableTransformerDecoder
d_model = 256
decoder_layer = DeformableTransformerDecoderLayer
num_decoder_layers = 6
return_intermediate_dec = True
look_forward_twice = True
use_checkpoint = False
self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels = 4, d_model = 256))
self.tgt_embed = nn.Embedding(self.two_stage_num_proposals = 900, d_model = 256)
two_stage
self.enc_output = nn.Linear(d_model = 256, d_model = 256)
self.enc_output_norm = nn.LayerNorm(d_model = 256)
self.resizer = FeatureResizer(input_feat_size = 768, output_feat_size = 256, dropout = 0.1)
self.mixed_selection = True
self.decouple_tgt = True
self.still_tgt_for_both = True
model = detr_class = DeformableDETRDINO
self.num_queries = 900
self.transformer = transformer = DeformableTransformerVLDINO
hidden_dim = 256
self.class_embed = VL_Align(cfg)
self.bbox_embed = MLP(input_dim = 256, hidden_dim = 256, output_dim = 4, num_layers = 3)
self.iou_head = nn.Linear(256, 1)
self.num_feature_levels = 4
self.decouple_tgt = True
For each scale of the backbone feature maps, build an input_proj module and append it to input_proj_list:
for _ in range(num_backbone_outs):
    in_channels = backbone.num_channels[_]
    input_proj_list.append(nn.Sequential(
        nn.Conv2d(in_channels, hidden_dim, kernel_size=1),
        nn.GroupNorm(32, hidden_dim),
    ))
If num_feature_levels is larger than the number of extracted multi-scale image features, add extra input_proj modules to cover the difference and append them to input_proj_list:
for _ in range(num_feature_levels - num_backbone_outs):
    input_proj_list.append(nn.Sequential(
        nn.Conv2d(in_channels, hidden_dim, kernel_size=3, stride=2, padding=1),
        nn.GroupNorm(32, hidden_dim),
    ))
    in_channels = hidden_dim
self.backbone = backbone = MaskedBackbone
self.aux_loss = True
self.with_box_refine = True
with_box_refine
self.class_embed = _get_clones(self.class_embed, num_pred)
self.bbox_embed = _get_clones(self.bbox_embed, num_pred)
cfg.MODEL.USE_IOU_BRANCH
self.iou_head = _get_clones(self.iou_head, num_pred-1)
self.transformer.decoder.bbox_embed = self.bbox_embed
self.two_stage = True
self.transformer.decoder.class_embed = self.class_embed
cfg.MODEL.STILL_CLS_FOR_ENCODER
self.transformer.decoder.class_embed[-1] = Still_Classifier(hidden_dim)
num_pred = (transformer.decoder.num_layers + 1) if two_stage else transformer.decoder.num_layers
For a two-stage model, num_pred is the number of decoder layers + 1.
self.mixed_selection = mixed_selection = True
self.detr = model_class = DDETRSegmUniDN
self.detr = detr = DeformableDETRDINO
self.ref_coord = ref_coord = True
self.ota = ota = True
self.decouple_tgt = decouple_tgt = True
self.cls_pool_type = 'average'
self.use_iou_branch = True
self.new_mask_head = False
self.use_raft = False
self.in_channels = hidden_dim // 32 = 256 // 32 = 8
self.dynamic_mask_channels = 8
self.controller_layers = 3
self.max_insts_num = 100
self.mask_out_stride = mask_out_stride = 4
self.up_rate = 8 // self.mask_out_stride = 2
self.weight_nums = [80, 64, 8]
self.bias_nums = [8, 8, 1]
self.num_gen_params = sum(weight_nums) + sum(bias_nums) = 169
self.controller = MLP(hidden_dim, hidden_dim, self.num_gen_params, 3)
self.mask_head = MaskHeadSmallConv(256, None, 256, use_raft = False, up_rate = self.up_rate)
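The weight_nums / bias_nums above follow the CondInst-style dynamic convolution scheme. A minimal sketch of how these counts are derived (variable names here are illustrative):
in_channels = 8             # hidden_dim // 32
dynamic_mask_channels = 8
controller_layers = 3

weight_nums, bias_nums = [], []
for l in range(controller_layers):
    if l == 0:
        # first dynamic conv sees the mask features plus 2 relative-coordinate channels
        weight_nums.append((in_channels + 2) * dynamic_mask_channels)      # 10 * 8 = 80
        bias_nums.append(dynamic_mask_channels)                            # 8
    elif l == controller_layers - 1:
        # last dynamic conv outputs a single mask logit channel
        weight_nums.append(dynamic_mask_channels * 1)                      # 8 * 1 = 8
        bias_nums.append(1)                                                # 1
    else:
        weight_nums.append(dynamic_mask_channels * dynamic_mask_channels)  # 8 * 8 = 64
        bias_nums.append(dynamic_mask_channels)                            # 8

num_gen_params = sum(weight_nums) + sum(bias_nums)   # 152 + 17 = 169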
self.dynamic_label_enc
self.resizer = FeatureResizer(input_feat_size = 768, output_feat_size = self.embed_dim = 256, dropout = 0.0)
matcher = HungarianMatcherVL(cost_class = 2, cost_bbox = 5, cost_giou = 2)
losses = ['labelsVL', 'boxes', 'masks']
self.criterion = DINOCriterion(matcher, weight_dict, losses)
self.aug = T.ResizeShortestEdge
demo.run_on_image
UNINEXTPredictor.forward
UNINEXT_IMG([inputs])
predictions = self.model([inputs])[0]
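A minimal sketch of the detectron2-style preprocessing behind demo.run_on_image / UNINEXTPredictor.forward; the resize sizes here are illustrative assumptions, not the exact config values:
import cv2
import torch
import detectron2.data.transforms as T

# illustrative sizes; the real values come from the test-time resize config
aug = T.ResizeShortestEdge([800, 800], 1333)

image = cv2.imread("demo.jpg")                                        # BGR, (H, W, 3)
height, width = image.shape[:2]
image = aug.get_transform(image).apply_image(image)
image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1))   # (3, H', W')

inputs = {"image": image, "height": height, "width": width}
# for grounding, the text expression is also packed into inputs (field name per UNINEXT's demo code)
with torch.no_grad():
    predictions = model([inputs])[0]                                  # model = UNINEXT_IMG(cfg)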
grounding
Convert the input expression into language features.
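A hedged sketch of this step, assuming a HuggingFace BERT-base text encoder (consistent with the 768-d input to FeatureResizer above); UNINEXT's own language-backbone wrapper and the exact dict keys may differ:
import torch
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
text_encoder = AutoModel.from_pretrained("bert-base-uncased")

expression = "the person in the red shirt"        # example referring expression
tokens = tokenizer(expression, return_tensors="pt")
with torch.no_grad():
    out = text_encoder(**tokens)

language_dict_features = {
    "hidden": out.last_hidden_state,              # (1, num_tokens, 768) per-token features
    "masks": tokens["attention_mask"],            # (1, num_tokens) valid-token mask
}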
self.detr.coco_inference
features, pos = self.detr.backbone(samples)
MaskedBackbone(sample)
xs = self.backbone(tensor_list.tensors)
Feed the image into ResNet to extract features
3 scales
res3: (1, 512, 100, 151)
res4: (1, 1024, 50, 76)
res5: (1, 2048, 25, 38)
Generate masks matching each multi-scale size of xs
(1, 100, 151)
(1, 50, 76)
(1, 25, 38)
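A minimal sketch of this mask generation, following the DETR-style MaskedBackbone convention of downsampling the image-level padding mask to each feature-map resolution (True = padded pixel):
import torch
import torch.nn.functional as F

def masks_for_features(img_mask, features):
    # img_mask: (bs, H, W) bool padding mask of the padded input batch
    # features: e.g. the [res3, res4, res5] tensors above
    out = []
    for feat in features:
        m = F.interpolate(img_mask[None].float(), size=feat.shape[-2:])
        out.append(m.to(torch.bool)[0])           # (bs, h_l, w_l)
    return out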
PositionEmbeddingSine
Generate the corresponding position embedding for each feature-map size
(1, 256, 100, 151)
(1, 256, 50, 76)
(1, 256, 25, 38)
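A minimal sketch of PositionEmbeddingSine as used in the DETR family (num_pos_feats=128, so the y and x halves concatenate to 256 channels):
import math
import torch

def sine_position_embedding(mask, num_pos_feats=128, temperature=10000):
    not_mask = ~mask                                   # (bs, h, w), True = valid pixel
    y_embed = not_mask.cumsum(1, dtype=torch.float32)  # running row index
    x_embed = not_mask.cumsum(2, dtype=torch.float32)  # running column index
    eps, scale = 1e-6, 2 * math.pi
    y_embed = y_embed / (y_embed[:, -1:, :] + eps) * scale
    x_embed = x_embed / (x_embed[:, :, -1:] + eps) * scale

    dim_t = torch.arange(num_pos_feats, dtype=torch.float32)
    dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats)

    pos_x = x_embed[:, :, :, None] / dim_t
    pos_y = y_embed[:, :, :, None] / dim_t
    pos_x = torch.stack((pos_x[..., 0::2].sin(), pos_x[..., 1::2].cos()), dim=4).flatten(3)
    pos_y = torch.stack((pos_y[..., 0::2].sin(), pos_y[..., 1::2].cos()), dim=4).flatten(3)
    return torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)       # (bs, 256, h, w)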
Iterate over the extracted features and change the channel dimension with self.detr.input_proj[index](src)
(1, 512, 100, 151) -> (1, 256, 100, 151)
(1, 1024, 50, 76) -> (1, 256, 50, 76)
(1, 2048, 25, 38) -> (1, 256, 25, 38)
If the number of feature maps is less than num_feature_levels, add an extra, smaller feature, mask, and pos
(1, 256, 13, 19)
query_embeds = (None, None)
self.detr.transformer(srcs, masks, poses, query_embeds)
self.detr.transformer = DeformableTransformerVLDINO
src = src.flatten(2).transpose(1, 2)
(bs, C, H, W) -> (bs, H * W, C)
mask = mask.flatten(1)
(bs, H, W) -> (bs, H * W)
pos_embed = pos_embed.flatten(2).transpose(1, 2)
(bs, C, H, W) -> (bs, H * W, C)
src_flatten = torch.cat(src_flatten, 1)
(1, 20097, 256)
mask_flatten = torch.cat(mask_flatten, 1)
(1, 20097)
lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1)
(1, 20097, 256)
spatial_shapes
tensor([[100, 151],
[ 50, 76],
[ 25, 38],
[ 13, 19]], device='cuda:0')
level_start_index
tensor([ 0, 15100, 18900, 19850], device='cuda:0')
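A sketch of the flattening step that produces src_flatten, mask_flatten, lvl_pos_embed_flatten, spatial_shapes and level_start_index (standard Deformable-DETR logic; level_embed is the nn.Parameter defined above):
import torch

# srcs, masks, poses: the per-level features, padding masks and position embeddings from above
src_flatten, mask_flatten, lvl_pos_embed_flatten, spatial_shapes = [], [], [], []
for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, poses)):
    bs, c, h, w = src.shape
    spatial_shapes.append((h, w))
    src_flatten.append(src.flatten(2).transpose(1, 2))               # (bs, h*w, c)
    mask_flatten.append(mask.flatten(1))                             # (bs, h*w)
    pos = pos_embed.flatten(2).transpose(1, 2)                       # (bs, h*w, c)
    lvl_pos_embed_flatten.append(pos + level_embed[lvl].view(1, 1, -1))

src_flatten = torch.cat(src_flatten, 1)                              # (1, 20097, 256)
mask_flatten = torch.cat(mask_flatten, 1)                            # (1, 20097)
lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1)          # (1, 20097, 256)
spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long)
# start offset of each level inside the flattened length-20097 sequence
level_start_index = torch.cat((spatial_shapes.new_zeros((1,)),
                               spatial_shapes.prod(1).cumsum(0)[:-1]))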
vl_feats_dict = self.encoder(src_flatten, spatial_shapes, level_start_index, valid_ratios, lvl_pos_embed_flatten, mask_flatten, language_dict_features)
get_reference_points
For each of the 4 feature-map sizes (width, height), take one sample point per pixel center (offset by 0.5) and generate the corresponding normalized x, y coordinates
(1, 20097, 4, 2)
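A sketch of get_reference_points (essentially the standard Deformable DETR routine): one normalized (x, y) point per pixel center of every level, scaled by the per-image valid ratios:
import torch

def get_reference_points(spatial_shapes, valid_ratios, device):
    reference_points_list = []
    for lvl, (H_, W_) in enumerate(spatial_shapes):
        H_, W_ = int(H_), int(W_)
        ref_y, ref_x = torch.meshgrid(
            torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device),
            torch.linspace(0.5, W_ - 0.5, W_, dtype=torch.float32, device=device))
        ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * H_)
        ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * W_)
        reference_points_list.append(torch.stack((ref_x, ref_y), -1))
    reference_points = torch.cat(reference_points_list, 1)            # (bs, 20097, 2)
    # broadcast to every level: (bs, 20097, num_levels, 2)
    return reference_points[:, :, None] * valid_ratios[:, None]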
BiMultiHeadAttention
Fuse the visual features with the language features
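A simplified sketch of the bidirectional vision-language attention idea (GLIP-style): one attention map between image tokens and text tokens, softmax-ed in both directions to update both modalities. The real BiMultiHeadAttention additionally splits into heads, clamps attention values and applies dropout, so treat this as an illustration only:
import torch
import torch.nn as nn

class BiAttentionSketch(nn.Module):
    def __init__(self, v_dim=256, l_dim=768, embed_dim=256):
        super().__init__()
        self.v_proj = nn.Linear(v_dim, embed_dim)
        self.l_proj = nn.Linear(l_dim, embed_dim)
        self.values_v_proj = nn.Linear(v_dim, embed_dim)
        self.values_l_proj = nn.Linear(l_dim, embed_dim)
        self.out_v_proj = nn.Linear(embed_dim, v_dim)
        self.out_l_proj = nn.Linear(embed_dim, l_dim)

    def forward(self, v, l):
        # v: (bs, 20097, 256) flattened image tokens; l: (bs, num_tokens, 768) text tokens
        q_v = self.v_proj(v)                                          # (bs, Nv, d)
        k_l = self.l_proj(l)                                          # (bs, Nl, d)
        attn = torch.bmm(q_v, k_l.transpose(1, 2)) / q_v.shape[-1] ** 0.5   # (bs, Nv, Nl)
        attn_v2l = attn.softmax(dim=-1)                               # vision attends over text tokens
        attn_l2v = attn.transpose(1, 2).softmax(dim=-1)               # text attends over image tokens
        v_out = v + self.out_v_proj(torch.bmm(attn_v2l, self.values_l_proj(l)))
        l_out = l + self.out_l_proj(torch.bmm(attn_l2v, self.values_v_proj(v)))
        return v_out, l_out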
output["visual"] = layer(output["visual"], pos, reference_points, spatial_shapes, level_start_index, padding_mask)
DeformableTransformerEncoderLayer
src2 = self.self_attn(self.with_pos_embed(src, pos), reference_points, src, spatial_shapes, level_start_index, padding_mask)
self_attn = MSDeformAttn
query = self.with_pos_embed(src, pos)
(1, 20097, 256)
reference_points
(1, 20097, 4, 2)
input_flatten
(1, 20097, 256)
input_spatial_shapes
[[100, 151], [50, 76], [25, 38], [13, 19]]
input_level_start_index
[0, 15100, 18900, 19850]
value = self.value_proj(input_flatten)
value: (1, 20097, 256)
value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads)
value: (1, 20097, 8, 32)
sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2)
self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)
Map the input query to offsets via nn.Linear:
8 heads * 4 multi-scale levels * 4 sampling points per level * (x, y)
(1, 20097, 8, 4, 4, 2)
attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points)
attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points)
(1, 20097, 8, 4, 4)
In the encoder, reference_points are expressed as (x, y) coordinates
The corresponding sampling_offsets come from a linear projection of the query, so they must be divided by the feature map's width and height to be normalized
offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
sampling_locations = reference_points[:, :, None, :, None, :] \
+ sampling_offsets / offset_normalizer[None, None, None, :, None, :]
Compute the attention output from value, sampling_locations, and attention_weights, then apply out_proj
(1, 20097, 256)
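For reference, the pure-PyTorch equivalent of this sampling step (ms_deform_attn_core_pytorch from Deformable DETR; the actual model uses a CUDA kernel) shows how value, sampling_locations and attention_weights are combined:
import torch
import torch.nn.functional as F

def ms_deform_attn_core_pytorch(value, spatial_shapes, sampling_locations, attention_weights):
    # value: (N, Len_in, n_heads, d_head); sampling_locations: (N, Len_q, n_heads, n_levels, n_points, 2)
    N_, _, n_heads, d_head = value.shape
    _, Len_q, _, n_levels, n_points, _ = sampling_locations.shape
    value_list = value.split([int(H_) * int(W_) for H_, W_ in spatial_shapes], dim=1)
    sampling_grids = 2 * sampling_locations - 1          # map [0, 1] coords to grid_sample's [-1, 1]
    sampling_value_list = []
    for lvl, (H_, W_) in enumerate(spatial_shapes):
        H_, W_ = int(H_), int(W_)
        # reshape this level's value back to a feature map: (N*n_heads, d_head, H_, W_)
        value_l = value_list[lvl].flatten(2).transpose(1, 2).reshape(N_ * n_heads, d_head, H_, W_)
        # sampling grid for this level: (N*n_heads, Len_q, n_points, 2)
        grid_l = sampling_grids[:, :, :, lvl].transpose(1, 2).flatten(0, 1)
        sampled = F.grid_sample(value_l, grid_l, mode='bilinear',
                                padding_mode='zeros', align_corners=False)
        sampling_value_list.append(sampled)               # (N*n_heads, d_head, Len_q, n_points)
    attention_weights = attention_weights.transpose(1, 2).reshape(
        N_ * n_heads, 1, Len_q, n_levels * n_points)
    output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1)
    return output.view(N_, n_heads * d_head, Len_q).transpose(1, 2)   # (N, Len_q, 256)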
Take the fused feature out of vl_feats_dict
lang_feat_pool
(1, 768)
Convert the dimension via self.resizer
(1, 256)
output_memory, output_proposals = self.gen_encoder_output_proposals(memory, mask_flatten, spatial_shapes)
Transform the memory computed by the encoder to generate the corresponding proposals and output memory:
output_proposals are computed from mask_flatten and spatial_shapes;
output_memory is obtained by projecting and normalizing the memory
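A sketch of gen_encoder_output_proposals following the standard Deformable DETR two-stage implementation (enc_output / enc_output_norm are the Linear + LayerNorm defined above, passed in here to keep the sketch self-contained):
import torch

def gen_encoder_output_proposals(memory, memory_padding_mask, spatial_shapes,
                                 enc_output, enc_output_norm):
    N_, _, _ = memory.shape
    proposals, cur = [], 0
    for lvl, (H_, W_) in enumerate(spatial_shapes):
        H_, W_ = int(H_), int(W_)
        mask_ = memory_padding_mask[:, cur:cur + H_ * W_].view(N_, H_, W_, 1)
        valid_H = torch.sum(~mask_[:, :, 0, 0], 1)                    # unpadded height per image
        valid_W = torch.sum(~mask_[:, 0, :, 0], 1)                    # unpadded width per image
        grid_y, grid_x = torch.meshgrid(
            torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device),
            torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device))
        grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1)
        scale = torch.cat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).view(N_, 1, 1, 2)
        grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale      # normalized centers
        wh = torch.ones_like(grid) * 0.05 * (2.0 ** lvl)                     # level-dependent box size
        proposals.append(torch.cat((grid, wh), -1).view(N_, -1, 4))
        cur += H_ * W_
    output_proposals = torch.cat(proposals, 1)                               # (bs, 20097, 4)
    valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True)
    output_proposals = torch.log(output_proposals / (1 - output_proposals))  # inverse sigmoid
    output_proposals = output_proposals.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf'))
    output_proposals = output_proposals.masked_fill(~valid, float('inf'))
    output_memory = memory.masked_fill(memory_padding_mask.unsqueeze(-1), 0.0)
    output_memory = output_memory.masked_fill(~valid, 0.0)
    output_memory = enc_output_norm(enc_output(output_memory))               # (bs, 20097, 256)
    return output_memory, output_proposals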
enc_outputs_class = self.decoder.class_embed[self.decoder.num_layers](output_memory, lang_feat_pool.unsqueeze(1))
Use the Still_Classifier in decoder.class_embed to obtain enc_outputs_class
the input is output_memory
(1, 20097, 1)
enc_outputs_coord_unact = self.decoder.bbox_embed[self.decoder.num_layers](output_memory) + output_proposals
Use the last layer of decoder.bbox_embed to obtain the corresponding coordinates
the inputs are output_memory and output_proposals
(1, 20097, 4)
topk_proposals = torch.topk(enc_outputs_class[..., 0], topk, dim=1)[1]
topk_coords_unact = torch.gather(enc_outputs_coord_unact, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4))
Take the indices of the top-k entries of enc_outputs_class, then use those indices to gather the corresponding coordinates from enc_outputs_coord_unact
topk_coords_unact: (1, 900, 4)
reference_points = topk_coords_unact.sigmoid()
tgt = self.tgt_embed.weight[None].repeat(bs, 1, 1)
The purpose of this line is to generate the target embeddings. self.tgt_embed is an embedding layer whose weight matrix rows can each be viewed as the embedding of one target; self.tgt_embed.weight retrieves this matrix. [None] adds a new dimension, changing the shape from (num_proposals, d_model) to (1, num_proposals, d_model), and repeat(bs, 1, 1) copies the matrix bs times to give a (bs, num_proposals, d_model) tensor, where bs is the batch size. In short, every sample in the batch gets its own set of target embeddings, which are used in the subsequent decoding.
if ref_feat is not None:
    if self.decouple_tgt:
        if self.still_tgt_for_both:
            tgt_new = tgt + 0.0 * ref_feat  # both use the original tgt
        else:
            if task == "detection":
                tgt_new = tgt + 0.0 * ref_feat  # "+ 0.0 *" is to avoid unused parameters
            elif task == "grounding":
                tgt_new = ref_feat + 0.0 * tgt  # "+ 0.0 *" is to avoid unused parameters
            else:
                raise ValueError("task should be detection or grounding")
    else:
        if query_embed[0] is None:
            tgt_new = ref_feat.repeat(1, self.two_stage_num_proposals, 1)
        else:
            tgt_new = torch.cat([query_embed[0], ref_feat.repeat(1, self.two_stage_num_proposals, 1)], 1)
        # avoid unused parameters
        tgt_new += 0.0 * torch.sum(self.tgt_embed.weight)
tgt_new is obtained from the previously defined tgt (the embedding) and the language feature ref_feat
hs, inter_references = self.decoder(tgt_new, reference_points, memory,
spatial_shapes, level_start_index, valid_ratios,
query_pos=None, src_padding_mask=mask_flatten,
attn_masks=attn_masks)
DeformableTransformerDecoder
output = layer(output, query_pos, reference_points_input, src, src_spatial_shapes, src_level_start_index, src_padding_mask, attn_masks)
output
the tgt_new obtained in the previous step
(1, 900, 256)
query_pos
position vector
(1, 900, 256)
reference_points_input
obtained from reference_points (expanded over the 4 feature levels)
(1, 900, 4) -> (1, 900, 4, 4)
src
memory: the result of feeding the image features through the encoder with early fusion of the language features
(1, 20097, 256)
src_spatial_shapes
[[100, 151], [50, 76], [25, 38], [13, 19]]
level_start_index
tensor([ 0, 15100, 18900, 19850], device='cuda:0')
First compute self-attention:
q = k = self.with_pos_embed(tgt, query_pos)
tgt2 = self.self_attn(q.transpose(0, 1), k.transpose(0, 1), tgt.transpose(0, 1), attn_mask=attn_masks)[0].transpose(0, 1)
tgt = tgt + self.dropout2(tgt2)
tgt = self.norm2(tgt)
Then compute cross-attention
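The cross-attention here is the same MSDeformAttn, with the 900 decoder queries sampling the fused image memory around their reference boxes; a sketch of the remainder of the decoder layer, following the Deformable DETR layer layout:
# cross-attention: query = tgt + query_pos, sampled from the memory (src) via MSDeformAttn
tgt2 = self.cross_attn(self.with_pos_embed(tgt, query_pos),
                       reference_points,                    # (1, 900, 4, 4)
                       src, src_spatial_shapes, level_start_index, src_padding_mask)
tgt = tgt + self.dropout1(tgt2)
tgt = self.norm1(tgt)

# feed-forward network
tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt))))
tgt = tgt + self.dropout4(tgt2)
tgt = self.norm3(tgt)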
If bounding-box refinement is used, the output of the current layer is fed into self.bbox_embed[lid] to compute tmp, which is then used to update reference_points:
if self.bbox_embed is not None:
    tmp = self.bbox_embed[lid](output)
    if reference_points.shape[-1] == 4:
        new_reference_points = tmp + inverse_sigmoid(reference_points)
        new_reference_points = new_reference_points.sigmoid()
    else:
        assert reference_points.shape[-1] == 2
        new_reference_points = tmp
        new_reference_points[..., :2] = tmp[..., :2] + inverse_sigmoid(reference_points)
        new_reference_points = new_reference_points.sigmoid()
    reference_points = new_reference_points.detach()  # new reference points for the next decoder layer
hs is the hidden state (stacked over the 6 decoder layers)
(6, 1, 900, 256)
inter_references are the predicted box locations output by the decoder
(1, 900, 4)
enc_outputs_class: (1, 20097, 1)
