internimage_t_1k_224 / modeling_internimage.py

Upload models

5bbdaf8 verified 11 months ago

34.7 kB

	# --------------------------------------------------------
	# InternImage
	# Copyright (c) 2025 OpenGVLab
	# Licensed under The MIT License [see LICENSE for details]
	# --------------------------------------------------------

	from dataclasses import dataclass
	from typing import Optional, Tuple

	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	import torch.utils.checkpoint as checkpoint
	from timm.models.layers import DropPath, trunc_normal_
	from torch import nn
	from transformers import PreTrainedModel
	from transformers.modeling_outputs import ModelOutput

	from .configuration_internimage import InternImageConfig
	from .dcnv3 import DCNv3, DCNv3_pytorch, has_cuda_kernel
	from .dcnv3_func import dcnv3_core_pytorch


	@dataclass
	class BackboneOutput(ModelOutput):
	"""
	Base class for outputs of backbones.
	"""

	hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
	pooler_output: Optional[torch.FloatTensor] = None
	last_hidden_state: Optional[torch.FloatTensor] = None
	logits: Optional[torch.FloatTensor] = None
	loss: Optional[torch.FloatTensor] = None


	class to_channels_first(nn.Module):

	def __init__(self):
	super().__init__()

	def forward(self, x):
	return x.permute(0, 3, 1, 2)


	class to_channels_last(nn.Module):

	def __init__(self):
	super().__init__()

	def forward(self, x):
	return x.permute(0, 2, 3, 1)


	def build_norm_layer(dim,
	norm_layer,
	in_format='channels_last',
	out_format='channels_last',
	eps=1e-6):
	layers = []
	if norm_layer == 'BN':
	if in_format == 'channels_last':
	layers.append(to_channels_first())
	layers.append(nn.BatchNorm2d(dim))
	if out_format == 'channels_last':
	layers.append(to_channels_last())
	elif norm_layer == 'LN':
	if in_format == 'channels_first':
	layers.append(to_channels_last())
	layers.append(nn.LayerNorm(dim, eps=eps))
	if out_format == 'channels_first':
	layers.append(to_channels_first())
	else:
	raise NotImplementedError(
	f'build_norm_layer does not support {norm_layer}')
	return nn.Sequential(*layers)


	def build_act_layer(act_layer):
	if act_layer == 'ReLU':
	return nn.ReLU(inplace=True)
	elif act_layer == 'SiLU':
	return nn.SiLU(inplace=True)
	elif act_layer == 'GELU':
	return nn.GELU()

	raise NotImplementedError(f'build_act_layer does not support {act_layer}')


	class CrossAttention(nn.Module):
	r""" Cross Attention Module
	Args:
	dim (int): Number of input channels.
	num_heads (int): Number of attention heads. Default: 8
	qkv_bias (bool, optional): If True, add a learnable bias to q, k, v.
	Default: False.
	qk_scale (float \| None, optional): Override default qk scale of
	head_dim ** -0.5 if set. Default: None.
	attn_drop (float, optional): Dropout ratio of attention weight.
	Default: 0.0
	proj_drop (float, optional): Dropout ratio of output. Default: 0.0
	attn_head_dim (int, optional): Dimension of attention head.
	out_dim (int, optional): Dimension of output.
	"""

	def __init__(self,
	dim,
	num_heads=8,
	qkv_bias=False,
	qk_scale=None,
	attn_drop=0.,
	proj_drop=0.,
	attn_head_dim=None,
	out_dim=None):
	super().__init__()
	if out_dim is None:
	out_dim = dim
	self.num_heads = num_heads
	head_dim = dim // num_heads
	if attn_head_dim is not None:
	head_dim = attn_head_dim
	all_head_dim = head_dim * self.num_heads
	self.scale = qk_scale or head_dim ** -0.5
	assert all_head_dim == dim

	self.q = nn.Linear(dim, all_head_dim, bias=False)
	self.k = nn.Linear(dim, all_head_dim, bias=False)
	self.v = nn.Linear(dim, all_head_dim, bias=False)

	if qkv_bias:
	self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
	self.k_bias = nn.Parameter(torch.zeros(all_head_dim))
	self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
	else:
	self.q_bias = None
	self.k_bias = None
	self.v_bias = None

	self.attn_drop = nn.Dropout(attn_drop)
	self.proj = nn.Linear(all_head_dim, out_dim)
	self.proj_drop = nn.Dropout(proj_drop)

	def forward(self, x, k=None, v=None):
	B, N, C = x.shape
	N_k = k.shape[1]
	N_v = v.shape[1]

	q_bias, k_bias, v_bias = None, None, None
	if self.q_bias is not None:
	q_bias = self.q_bias
	k_bias = self.k_bias
	v_bias = self.v_bias

	q = F.linear(input=x, weight=self.q.weight, bias=q_bias)
	q = q.reshape(B, N, 1, self.num_heads,
	-1).permute(2, 0, 3, 1,
	4).squeeze(0) # (B, N_head, N_q, dim)

	k = F.linear(input=k, weight=self.k.weight, bias=k_bias)
	k = k.reshape(B, N_k, 1, self.num_heads, -1).permute(2, 0, 3, 1,
	4).squeeze(0)

	v = F.linear(input=v, weight=self.v.weight, bias=v_bias)
	v = v.reshape(B, N_v, 1, self.num_heads, -1).permute(2, 0, 3, 1,
	4).squeeze(0)

	q = q * self.scale
	attn = (q @ k.transpose(-2, -1)) # (B, N_head, N_q, N_k)

	attn = attn.softmax(dim=-1)
	attn = self.attn_drop(attn)

	x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
	x = self.proj(x)
	x = self.proj_drop(x)

	return x


	class AttentiveBlock(nn.Module):
	r"""Attentive Block
	Args:
	dim (int): Number of input channels.
	num_heads (int): Number of attention heads. Default: 8
	qkv_bias (bool, optional): If True, add a learnable bias to q, k, v.
	Default: False.
	qk_scale (float \| None, optional): Override default qk scale of
	head_dim ** -0.5 if set. Default: None.
	drop (float, optional): Dropout rate. Default: 0.0.
	attn_drop (float, optional): Attention dropout rate. Default: 0.0.
	drop_path (float \| tuple[float], optional): Stochastic depth rate.
	Default: 0.0.
	norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm.
	attn_head_dim (int, optional): Dimension of attention head. Default: None.
	out_dim (int, optional): Dimension of output. Default: None.
	"""

	def __init__(self,
	dim,
	num_heads,
	qkv_bias=False,
	qk_scale=None,
	drop=0.,
	attn_drop=0.,
	drop_path=0.,
	norm_layer='LN',
	attn_head_dim=None,
	out_dim=None):
	super().__init__()

	self.norm1_q = build_norm_layer(dim, norm_layer, eps=1e-6)
	self.norm1_k = build_norm_layer(dim, norm_layer, eps=1e-6)
	self.norm1_v = build_norm_layer(dim, norm_layer, eps=1e-6)
	self.cross_dcn = CrossAttention(dim,
	num_heads=num_heads,
	qkv_bias=qkv_bias,
	qk_scale=qk_scale,
	attn_drop=attn_drop,
	proj_drop=drop,
	attn_head_dim=attn_head_dim,
	out_dim=out_dim)

	self.drop_path = DropPath(
	drop_path) if drop_path > 0. else nn.Identity()

	def forward(self,
	x_q,
	x_kv,
	pos_q,
	pos_k,
	bool_masked_pos,
	rel_pos_bias=None):
	x_q = self.norm1_q(x_q + pos_q)
	x_k = self.norm1_k(x_kv + pos_k)
	x_v = self.norm1_v(x_kv)

	x = self.cross_dcn(x_q, k=x_k, v=x_v)

	return x


	class AttentionPoolingBlock(AttentiveBlock):

	def forward(self, x):
	x_q = x.mean(1, keepdim=True)
	x_kv = x
	pos_q, pos_k = 0, 0
	x = super().forward(x_q, x_kv, pos_q, pos_k,
	bool_masked_pos=None,
	rel_pos_bias=None)
	x = x.squeeze(1)
	return x


	class StemLayer(nn.Module):
	r"""Stem layer of InternImage
	Args:
	in_chans (int): number of input channels
	out_chans (int): number of output channels
	act_layer (str): activation layer
	norm_layer (str): normalization layer
	"""

	def __init__(self,
	in_chans=3,
	out_chans=96,
	act_layer='GELU',
	norm_layer='BN'):
	super().__init__()
	self.conv1 = nn.Conv2d(in_chans,
	out_chans // 2,
	kernel_size=3,
	stride=2,
	padding=1)
	self.norm1 = build_norm_layer(out_chans // 2, norm_layer,
	'channels_first', 'channels_first')
	self.act = build_act_layer(act_layer)
	self.conv2 = nn.Conv2d(out_chans // 2,
	out_chans,
	kernel_size=3,
	stride=2,
	padding=1)
	self.norm2 = build_norm_layer(out_chans, norm_layer, 'channels_first',
	'channels_last')

	def forward(self, x):
	x = self.conv1(x)
	x = self.norm1(x)
	x = self.act(x)
	x = self.conv2(x)
	x = self.norm2(x)
	return x


	class DownsampleLayer(nn.Module):
	r"""Downsample layer of InternImage
	Args:
	channels (int): number of input channels
	norm_layer (str): normalization layer
	"""

	def __init__(self, channels, norm_layer='LN'):
	super().__init__()
	self.conv = nn.Conv2d(channels,
	2 * channels,
	kernel_size=3,
	stride=2,
	padding=1,
	bias=False)
	self.norm = build_norm_layer(2 * channels, norm_layer,
	'channels_first', 'channels_last')

	def forward(self, x):
	x = self.conv(x.permute(0, 3, 1, 2))
	x = self.norm(x)
	return x


	class MLPLayer(nn.Module):
	r"""MLP layer of InternImage
	Args:
	in_features (int): number of input features
	hidden_features (int): number of hidden features
	out_features (int): number of output features
	act_layer (str): activation layer
	drop (float): dropout rate
	"""

	def __init__(self,
	in_features,
	hidden_features=None,
	out_features=None,
	act_layer='GELU',
	drop=0.):
	super().__init__()
	out_features = out_features or in_features
	hidden_features = hidden_features or in_features
	self.fc1 = nn.Linear(in_features, hidden_features)
	self.act = build_act_layer(act_layer)
	self.fc2 = nn.Linear(hidden_features, out_features)
	self.drop = nn.Dropout(drop)

	def forward(self, x):
	x = self.fc1(x)
	x = self.act(x)
	x = self.drop(x)
	x = self.fc2(x)
	x = self.drop(x)
	return x


	class InternImageLayer(nn.Module):
	r"""Basic layer of InternImage
	Args:
	core_op (nn.Module): core operation of InternImage
	channels (int): number of input channels
	groups (list): Groups of each block.
	mlp_ratio (float): ratio of mlp hidden features to input channels
	drop (float): dropout rate
	drop_path (float): drop path rate
	act_layer (str): activation layer
	norm_layer (str): normalization layer
	post_norm (bool): whether to use post normalization
	layer_scale (float): layer scale
	offset_scale (float): offset scale
	with_cp (bool): whether to use checkpoint
	"""

	def __init__(self,
	core_op,
	channels,
	groups,
	mlp_ratio=4.,
	drop=0.,
	drop_path=0.,
	act_layer='GELU',
	norm_layer='LN',
	post_norm=False,
	layer_scale=None,
	offset_scale=1.0,
	with_cp=False,
	dw_kernel_size=None, # for InternImage-H/G
	res_post_norm=False, # for InternImage-H/G
	center_feature_scale=False, # for InternImage-H/G
	remove_center=False, # for InternImage-H/G
	):
	super().__init__()
	self.channels = channels
	self.groups = groups
	self.mlp_ratio = mlp_ratio
	self.with_cp = with_cp

	self.norm1 = build_norm_layer(channels, 'LN')
	self.post_norm = post_norm
	self.dcn = core_op(
	channels=channels,
	kernel_size=3,
	stride=1,
	pad=1,
	dilation=1,
	group=groups,
	offset_scale=offset_scale,
	act_layer=act_layer,
	norm_layer=norm_layer,
	dw_kernel_size=dw_kernel_size, # for InternImage-H/G
	center_feature_scale=center_feature_scale, # for InternImage-H/G
	remove_center=remove_center, # for InternImage-H/G
	)
	self.drop_path = DropPath(drop_path) if drop_path > 0. \
	else nn.Identity()
	self.norm2 = build_norm_layer(channels, 'LN')
	self.mlp = MLPLayer(in_features=channels,
	hidden_features=int(channels * mlp_ratio),
	act_layer=act_layer,
	drop=drop)
	self.layer_scale = layer_scale is not None
	if self.layer_scale:
	self.layer_scale1 = nn.Parameter(layer_scale * torch.ones(channels),
	requires_grad=True)
	self.layer_scale2 = nn.Parameter(layer_scale * torch.ones(channels),
	requires_grad=True)
	self.res_post_norm = res_post_norm
	if res_post_norm:
	self.res_post_norm1 = build_norm_layer(channels, 'LN')
	self.res_post_norm2 = build_norm_layer(channels, 'LN')

	def forward(self, x):

	def _inner_forward(x):
	if not self.layer_scale:
	if self.post_norm:
	x = x + self.drop_path(self.norm1(self.dcn(x)))
	x = x + self.drop_path(self.norm2(self.mlp(x)))
	elif self.res_post_norm: # for InternImage-H/G
	x = x + self.drop_path(self.res_post_norm1(self.dcn(self.norm1(x))))
	x = x + self.drop_path(self.res_post_norm2(self.mlp(self.norm2(x))))
	else:
	x = x + self.drop_path(self.dcn(self.norm1(x)))
	x = x + self.drop_path(self.mlp(self.norm2(x)))
	return x
	if self.post_norm:
	x = x + self.drop_path(self.layer_scale1 * self.norm1(self.dcn(x)))
	x = x + self.drop_path(self.layer_scale2 * self.norm2(self.mlp(x)))
	else:
	x = x + self.drop_path(self.layer_scale1 * self.dcn(self.norm1(x)))
	x = x + self.drop_path(self.layer_scale2 * self.mlp(self.norm2(x)))
	return x

	if self.with_cp and x.requires_grad:
	x = checkpoint.checkpoint(_inner_forward, x)
	else:
	x = _inner_forward(x)
	return x


	class InternImageBlock(nn.Module):
	r"""Block of InternImage
	Args:
	core_op (nn.Module): core operation of InternImage
	channels (int): number of input channels
	depths (list): Depth of each block.
	groups (list): Groups of each block.
	mlp_ratio (float): ratio of mlp hidden features to input channels
	drop (float): dropout rate
	drop_path (float): drop path rate
	act_layer (str): activation layer
	norm_layer (str): normalization layer
	post_norm (bool): whether to use post normalization
	layer_scale (float): layer scale
	offset_scale (float): offset scale
	with_cp (bool): whether to use checkpoint
	"""

	def __init__(self,
	core_op,
	channels,
	depth,
	groups,
	downsample=True,
	mlp_ratio=4.,
	drop=0.,
	drop_path=0.,
	act_layer='GELU',
	norm_layer='LN',
	post_norm=False,
	offset_scale=1.0,
	layer_scale=None,
	with_cp=False,
	dw_kernel_size=None, # for InternImage-H/G
	post_norm_block_ids=None, # for InternImage-H/G
	res_post_norm=False, # for InternImage-H/G
	center_feature_scale=False, # for InternImage-H/G
	remove_center=False, # for InternImage-H/G
	):
	super().__init__()
	self.channels = channels
	self.depth = depth
	self.post_norm = post_norm
	self.center_feature_scale = center_feature_scale

	self.blocks = nn.ModuleList([
	InternImageLayer(
	core_op=core_op,
	channels=channels,
	groups=groups,
	mlp_ratio=mlp_ratio,
	drop=drop,
	drop_path=drop_path[i] if isinstance(
	drop_path, list) else drop_path,
	act_layer=act_layer,
	norm_layer=norm_layer,
	post_norm=post_norm,
	layer_scale=layer_scale,
	offset_scale=offset_scale,
	with_cp=with_cp,
	dw_kernel_size=dw_kernel_size, # for InternImage-H/G
	res_post_norm=res_post_norm, # for InternImage-H/G
	center_feature_scale=center_feature_scale, # for InternImage-H/G
	remove_center=remove_center, # for InternImage-H/G
	) for i in range(depth)
	])
	if not self.post_norm or center_feature_scale:
	self.norm = build_norm_layer(channels, 'LN')
	self.post_norm_block_ids = post_norm_block_ids
	if post_norm_block_ids is not None: # for InternImage-H/G
	self.post_norms = nn.ModuleList(
	[build_norm_layer(channels, 'LN', eps=1e-6) for _ in post_norm_block_ids]
	)
	self.downsample = DownsampleLayer(
	channels=channels, norm_layer=norm_layer) if downsample else None

	def forward(self, x, return_wo_downsample=False):
	for i, blk in enumerate(self.blocks):
	x = blk(x)
	if (self.post_norm_block_ids is not None) and (i in self.post_norm_block_ids):
	index = self.post_norm_block_ids.index(i)
	x = self.post_norms[index](x) # for InternImage-H/G
	if not self.post_norm or self.center_feature_scale:
	x = self.norm(x)
	if return_wo_downsample:
	x_ = x
	if self.downsample is not None:
	x = self.downsample(x)

	if return_wo_downsample:
	return x, x_
	return x


	class InternImage(nn.Module):
	r"""InternImage
	A PyTorch impl of : `InternImage: Exploring Large-Scale Vision Foundation Models with Deformable Convolutions` -
	https://arxiv.org/pdf/2103.14030
	Args:
	core_op (str): Core operator. Default: 'DCNv3'
	channels (int): Number of the first stage. Default: 64
	depths (list): Depth of each block. Default: [3, 4, 18, 5]
	groups (list): Groups of each block. Default: [3, 6, 12, 24]
	num_classes (int): Number of classes. Default: 1000
	mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
	drop_rate (float): Probability of an element to be zeroed. Default: 0.
	drop_path_rate (float): Stochastic depth rate. Default: 0.
	act_layer (str): Activation layer. Default: 'GELU'
	norm_layer (str): Normalization layer. Default: 'LN'
	layer_scale (float): The initial value of layer scale. Default: None
	cls_scale (float): Whether to use class scale. Default: 1.5
	with_cp (bool): Use gradient checkpointing or not. Default: False
	dw_kernel_size (int): Size of the dwconv. Default: None
	use_clip_projector (bool): Whether to use clip projector. Default: False
	level2_post_norm (bool): Whether to use level2 post norm. Default: False
	level2_post_norm_block_ids (list): Indexes of post norm blocks. Default: None
	res_post_norm (bool): Whether to use res post norm. Default: False
	center_feature_scale (bool): Whether to use center feature scale. Default: False
	"""

	def __init__(self,
	core_op='DCNv3',
	channels=64,
	depths=[3, 4, 18, 5],
	groups=[3, 6, 12, 24],
	num_classes=1000,
	mlp_ratio=4.,
	drop_rate=0.,
	drop_path_rate=0.2,
	drop_path_type='linear',
	act_layer='GELU',
	norm_layer='LN',
	layer_scale=None,
	offset_scale=1.0,
	post_norm=False,
	cls_scale=1.5,
	with_cp=False,
	dw_kernel_size=None, # for InternImage-H/G
	use_clip_projector=False, # for InternImage-H/G
	level2_post_norm=False, # for InternImage-H/G
	level2_post_norm_block_ids=None, # for InternImage-H/G
	res_post_norm=False, # for InternImage-H/G
	center_feature_scale=False, # for InternImage-H/G
	remove_center=False, # for InternImage-H/G
	**kwargs):
	super().__init__()
	if core_op == 'DCNv3' and has_cuda_kernel:
	self.core_op = DCNv3
	print('DCNv3 is installed, using CUDA implementation.')
	elif core_op == 'DCNv3' and not has_cuda_kernel:
	self.core_op = DCNv3_pytorch
	print('DCNv3 is not installed, using PyTorch implementation.')
	else:
	self.core_op = DCNv3_pytorch
	print('Using DCNv3 PyTorch implementation.')
	self.num_classes = num_classes
	self.num_levels = len(depths)
	self.depths = depths
	self.channels = channels
	self.num_features = int(channels * 2 ** (self.num_levels - 1))
	self.post_norm = post_norm
	self.mlp_ratio = mlp_ratio
	self.use_clip_projector = use_clip_projector
	self.level2_post_norm_block_ids = level2_post_norm_block_ids
	self.remove_center = remove_center

	print(f'using core type: {core_op}')
	print(f'level2_post_norm: {level2_post_norm}')
	print(f'level2_post_norm_block_ids: {level2_post_norm_block_ids}')
	print(f'res_post_norm: {res_post_norm}')
	print(f'remove_center: {remove_center}')

	in_chans = 3
	self.patch_embed = StemLayer(in_chans=in_chans,
	out_chans=channels,
	act_layer=act_layer,
	norm_layer=norm_layer)
	self.pos_drop = nn.Dropout(p=drop_rate)

	dpr = [
	x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
	]
	if drop_path_type == 'uniform':
	for i in range(len(dpr)):
	dpr[i] = drop_path_rate

	self.levels = nn.ModuleList()
	for i in range(self.num_levels):
	post_norm_block_ids = level2_post_norm_block_ids if level2_post_norm and (
	i == 2) else None # for InternImage-H/G
	level = InternImageBlock(
	core_op=self.core_op,
	channels=int(channels * 2 ** i),
	depth=depths[i],
	groups=groups[i],
	mlp_ratio=self.mlp_ratio,
	drop=drop_rate,
	drop_path=dpr[sum(depths[:i]):sum(depths[:i + 1])],
	act_layer=act_layer,
	norm_layer=norm_layer,
	post_norm=post_norm,
	downsample=(i < self.num_levels - 1),
	layer_scale=layer_scale,
	offset_scale=offset_scale,
	with_cp=with_cp,
	dw_kernel_size=dw_kernel_size, # for InternImage-H/G
	post_norm_block_ids=post_norm_block_ids, # for InternImage-H/G
	res_post_norm=res_post_norm, # for InternImage-H/G
	center_feature_scale=center_feature_scale, # for InternImage-H/G
	remove_center=remove_center, # for InternImage-H/G
	)
	self.levels.append(level)

	if self.num_classes > 0:
	if not use_clip_projector: # for InternImage-T/S/B/L/XL
	self.conv_head = nn.Sequential(
	nn.Conv2d(self.num_features,
	int(self.num_features * cls_scale),
	kernel_size=1,
	bias=False),
	build_norm_layer(int(self.num_features * cls_scale), 'BN',
	'channels_first', 'channels_first'),
	build_act_layer(act_layer))
	self.head = nn.Linear(int(self.num_features * cls_scale), num_classes) \
	if num_classes > 0 else nn.Identity()
	else: # for InternImage-H/G
	pretrain_embed_dim, _stride, attnpool_num_heads, clip_embed_dim = 1024, 2, 16, 768
	self.dcnv3_head_x4 = nn.Sequential(
	nn.Conv2d(in_channels=self.num_features,
	out_channels=pretrain_embed_dim * (_stride ** 2),
	kernel_size=1), nn.PixelShuffle(_stride))
	self.dcnv3_head_x3 = nn.Conv2d(in_channels=self.num_features // 2,
	out_channels=pretrain_embed_dim,
	kernel_size=1)
	self.clip_projector = AttentionPoolingBlock(
	dim=pretrain_embed_dim,
	num_heads=attnpool_num_heads,
	qkv_bias=True,
	qk_scale=None,
	drop=0.,
	attn_drop=0.,
	norm_layer=norm_layer,
	out_dim=clip_embed_dim)
	self.fc_norm = build_norm_layer(clip_embed_dim, norm_layer, eps=1e-6)
	self.head = nn.Linear(
	clip_embed_dim, num_classes) if num_classes > 0 else nn.Identity()

	self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
	self.num_layers = len(depths)
	self.apply(self._init_weights)
	self.apply(self._init_deform_weights)

	def _init_weights(self, m):
	if isinstance(m, nn.Linear):
	trunc_normal_(m.weight, std=.02)
	if isinstance(m, nn.Linear) and m.bias is not None:
	nn.init.constant_(m.bias, 0)
	elif isinstance(m, nn.LayerNorm):
	nn.init.constant_(m.bias, 0)
	nn.init.constant_(m.weight, 1.0)

	def _init_deform_weights(self, m):
	if isinstance(m, self.core_op):
	m._reset_parameters()

	@torch.jit.ignore
	def lr_decay_keywords(self, decay_ratio=0.87):
	lr_ratios = {}

	# blocks
	idx = 0
	for i in range(4):
	layer_num = 3 - i # 3 2 1 0
	for j in range(self.depths[layer_num]):
	block_num = self.depths[layer_num] - j - 1
	tag = 'levels.{}.blocks.{}.'.format(layer_num, block_num)
	decay = 1.0 * (decay_ratio ** idx)
	lr_ratios[tag] = decay
	idx += 1
	# patch_embed (before stage-1)
	lr_ratios['patch_embed'] = lr_ratios['levels.0.blocks.0.']
	# levels.0.downsample (between stage-1 and stage-2)
	lr_ratios['levels.0.downsample'] = lr_ratios['levels.1.blocks.0.']
	lr_ratios['levels.0.norm'] = lr_ratios['levels.1.blocks.0.']
	# levels.1.downsample (between stage-2 and stage-3)
	lr_ratios['levels.1.downsample'] = lr_ratios['levels.2.blocks.0.']
	lr_ratios['levels.1.norm'] = lr_ratios['levels.2.blocks.0.']
	# levels.2.downsample (between stage-3 and stage-4)
	lr_ratios['levels.2.downsample'] = lr_ratios['levels.3.blocks.0.']
	lr_ratios['levels.2.norm'] = lr_ratios['levels.3.blocks.0.']
	return lr_ratios

	def forward_features_seq_out(self, x):
	x = self.patch_embed(x)
	x = self.pos_drop(x)

	seq_out = []
	for level in self.levels:
	x, x_ = level(x, return_wo_downsample=True)
	seq_out.append(x_)
	return seq_out

	def forward_features(self, x):
	xs = self.forward_features_seq_out(x)
	x1, x2, x3, x4 = xs

	x1 = x1.permute(0, 3, 1, 2) # NHWC -> NCHW
	x2 = x2.permute(0, 3, 1, 2) # NHWC -> NCHW
	x3 = x3.permute(0, 3, 1, 2) # NHWC -> NCHW
	x4 = x4.permute(0, 3, 1, 2) # NHWC -> NCHW
	hidden_states = [x1, x2, x3, x4]

	if self.num_classes > 0:
	x = self.conv_head(x4)
	x = self.avgpool(x)
	x = torch.flatten(x, 1)

	return {
	'hidden_states': hidden_states,
	'pooler_output': x if self.num_classes > 0 else None
	}

	def forward_clip_projector(self, x): # for InternImage-H/G
	xs = self.forward_features_seq_out(x)
	x1, x2, x3, x4 = xs

	x1 = x1.permute(0, 3, 1, 2) # NHWC -> NCHW
	x2 = x2.permute(0, 3, 1, 2) # NHWC -> NCHW
	x3 = x3.permute(0, 3, 1, 2) # NHWC -> NCHW
	x4 = x4.permute(0, 3, 1, 2) # NHWC -> NCHW
	hidden_states = [x1, x2, x3, x4]

	if self.num_classes > 0:
	x4 = self.dcnv3_head_x4(x4)
	x = x4
	x3 = self.dcnv3_head_x3(x3)
	x = x + x3

	x = x.flatten(-2).transpose(1, 2).contiguous()
	x = self.clip_projector(x)
	x = self.fc_norm(x)

	return {
	'hidden_states': hidden_states,
	'pooler_output': x if self.num_classes > 0 else None
	}

	def forward(self, x):
	if self.use_clip_projector: # for InternImage-H/G
	outputs = self.forward_clip_projector(x)
	else: # for InternImage-T/S/B/L/XL
	outputs = self.forward_features(x)

	hidden_states = outputs['hidden_states']
	pooler_output = outputs['pooler_output']

	if self.num_classes > 0:
	logits = self.head(pooler_output)
	else:
	logits = None

	return BackboneOutput(
	hidden_states=hidden_states,
	last_hidden_state=hidden_states[-1],
	pooler_output=pooler_output,
	logits=logits
	)


	class InternImageModel(PreTrainedModel):
	config_class = InternImageConfig

	def __init__(self, config):
	super().__init__(config)
	self.model = InternImage(
	core_op=config.core_op,
	channels=config.channels,
	depths=config.depths,
	groups=config.groups,
	num_classes=0,
	mlp_ratio=config.mlp_ratio,
	drop_rate=config.drop_rate,
	drop_path_rate=config.drop_path_rate,
	drop_path_type=config.drop_path_type,
	act_layer=config.act_layer,
	norm_layer=config.norm_layer,
	layer_scale=config.layer_scale,
	offset_scale=config.offset_scale,
	post_norm=config.post_norm,
	cls_scale=config.cls_scale,
	with_cp=config.with_cp,
	dw_kernel_size=config.dw_kernel_size, # for InternImage-H/G
	use_clip_projector=config.use_clip_projector, # for InternImage-H/G
	level2_post_norm=config.level2_post_norm, # for InternImage-H/G
	level2_post_norm_block_ids=config.level2_post_norm_block_ids, # for InternImage-H/G
	res_post_norm=config.res_post_norm, # for InternImage-H/G
	center_feature_scale=config.center_feature_scale, # for InternImage-H/G
	remove_center=config.remove_center, # for InternImage-H/G
	)

	def forward(self, pixel_values):
	return self.model.forward_features(pixel_values)


	class InternImageModelForImageClassification(PreTrainedModel):
	config_class = InternImageConfig

	def __init__(self, config):
	super().__init__(config)
	self.model = InternImage(
	core_op=config.core_op,
	channels=config.channels,
	depths=config.depths,
	groups=config.groups,
	num_classes=config.num_classes,
	mlp_ratio=config.mlp_ratio,
	drop_rate=config.drop_rate,
	drop_path_rate=config.drop_path_rate,
	drop_path_type=config.drop_path_type,
	act_layer=config.act_layer,
	norm_layer=config.norm_layer,
	layer_scale=config.layer_scale,
	offset_scale=config.offset_scale,
	post_norm=config.post_norm,
	cls_scale=config.cls_scale,
	with_cp=config.with_cp,
	dw_kernel_size=config.dw_kernel_size, # for InternImage-H/G
	use_clip_projector=config.use_clip_projector, # for InternImage-H/G
	level2_post_norm=config.level2_post_norm, # for InternImage-H/G
	level2_post_norm_block_ids=config.level2_post_norm_block_ids, # for InternImage-H/G
	res_post_norm=config.res_post_norm, # for InternImage-H/G
	center_feature_scale=config.center_feature_scale, # for InternImage-H/G
	remove_center=config.remove_center, # for InternImage-H/G
	)

	def forward(self, pixel_values, labels=None):
	outputs = self.model.forward(pixel_values)

	if labels is not None:
	logits = outputs['logits']
	loss = F.cross_entropy(logits, labels)
	outputs['loss'] = loss

	return outputs