1. TSNE 可视化
# 3D / 2D可视化
def TSNE_API(data, label, plot_class_list=None, color_list=None):
    """
    Embed features in 2-D with t-SNE and scatter-plot the selected classes.

    The embedding is min-max normalised per axis and drawn on a 3-D axes at
    z = 0. For every plotted class, the first sample of that class (in input
    order) is annotated with its label.

    :param data: numpy array N x dim
    :param label: numpy array with N class ids
    :param plot_class_list: class ids to draw; defaults to [79, 86, 87, 91]
    :param color_list: one colour per entry of plot_class_list; defaults to
        the four-colour palette matching the default class list
    :return: the matplotlib figure that was drawn
    :raises ValueError: if fewer colours than classes are supplied
    """
    print('data shape: {}, label shape: {}'.format(data.shape, label.shape))

    if plot_class_list is None:
        plot_class_list = [79, 86, 87, 91]
    if color_list is None:
        color_list = ["#000000", "#B22222", "#9400D3", "#FFA500"]
    if len(color_list) < len(plot_class_list):
        raise ValueError(
            "color_list needs at least one colour per class in plot_class_list"
        )

    # The iteration count is left at the library default (1000, the value the
    # original passed explicitly) because newer scikit-learn releases renamed
    # the keyword from n_iter to max_iter.
    tsne = TSNE(n_components=2, init='random', random_state=100, perplexity=40)
    embedding = tsne.fit_transform(data)

    # Min-max normalise each axis to [0, 1]; a degenerate axis (max == min)
    # is mapped to 0 instead of producing NaN from a division by zero.
    lower = np.min(embedding, 0)
    span = np.max(embedding, 0) - lower
    span[span == 0] = 1.0
    data_norm = (embedding - lower) / span

    labels = np.asarray(label).reshape(-1)

    fig = plt.figure(dpi=600)
    # Axes3D(fig) no longer attaches itself to the figure on current
    # matplotlib, so the axes are created through the figure instead.
    ax = fig.add_subplot(projection='3d')

    # One scatter call per class (instead of one per point) keeps rendering
    # fast and yields a single legend entry per class.
    for cls, color in zip(plot_class_list, color_list):
        member_idx = np.flatnonzero(labels == cls)
        if member_idx.size == 0:
            continue
        points = data_norm[member_idx]
        ax.scatter(points[:, 0], points[:, 1], 0, s=10, c=color,
                   alpha=0.6, marker='o', label=str(cls))
        # Annotate the first sample of the class with its label.
        ax.text(points[0, 0], points[0, 1], 0, str(labels[member_idx[0]]),
                fontdict={'weight': 'bold', 'size': 10}, color="#000000")

    ax.set_xlabel('X label')
    ax.set_ylabel('Y label')
    plt.show()
    return fig
2. 关于 hub 问题的评估指标——斜度(Skewness)的计算
给定一个数据集作为输入,每条数据都是一个高维向量。hub 问题指的是:存在一些数据点常常作为其它许多数据点的 $k$ 近邻点,即使这些点并不属于同一类别。
斜度(Skewness):记 $N_{k}^{h}(x)$ 为数据点 $x$ 作为其它数据点的 $k$ 近邻点的次数,该变量分布的倾斜程度用斜度来度量;特别地,正态分布的斜度为 0。
下方代码的输入为一个数据集(代码注释中数据点个数记为 L),$k$ 近邻默认 $k=5$(对应函数参数 `N=5`)。
def Skewness(pkl_data_path, N=5):
    """
    Print the skewness of the k-occurrence distribution of a feature set.

    Loads ``all_dataset['test_unseen']['fea']`` from a pickle file,
    L2-normalises the features, finds the ``N`` most cosine-similar points
    for every point, counts how often each point appears in those neighbour
    lists, and prints the skewness and the mean of the counts.

    :param pkl_data_path: path to the pickled dataset dictionary
    :param N: number of nearest neighbours (k) taken per point
    :return: None; results are printed
    """
    # NOTE: pickle can execute arbitrary code while loading; only use this on
    # trusted files.
    with open(pkl_data_path, 'rb') as fh:
        all_dataset = pickle.load(fh, encoding='utf-8')
    tu_data = t.from_numpy(all_dataset['test_unseen']['fea'])
    # Use the GPU when there is one, otherwise stay on the CPU instead of
    # crashing in .cuda().
    if t.cuda.is_available():
        tu_data = tu_data.cuda()
    num_points = len(tu_data)
    print("data points number: {}".format(num_points))  # L

    # L2 normalisation, so the matrix product below is the cosine similarity
    # (could be swapped for a Euclidean distance).
    tu_data_norm = F.normalize(tu_data, p=2, dim=-1)
    cos_sim = t.mm(tu_data_norm, tu_data_norm.t())  # L x L

    # Indices of the N most similar points for every point.
    # NOTE(review): the diagonal is not masked, so each point is normally its
    # own top-1 neighbour and is counted once for itself -- confirm whether
    # self-matches should be excluded.
    values, indices = cos_sim.topk(N, dim=1, largest=True, sorted=True)
    print("KNN size: {}".format(values.size()))
    indices = indices.cpu().numpy()

    # How many times each point occurs in any neighbour list.
    count_list = np.bincount(indices.ravel(), minlength=num_points)
    print("check count_list: {}".format(count_list.sum()))

    # Skewness: https://www.cnblogs.com/jiaxin359/p/8977333.html
    sc = pd.Series(count_list)
    print(sc.skew())
    # The original called .mean() on a plain Python list, which raises
    # AttributeError; the counts are a numpy array here.
    print(count_list.mean())
PREVIOUS自监督学习self Supervised Learning
NEXT论文实验记录