可视化方法

 

1. TSNE 可视化

# 3D / 2D可视化
def TSNE_API(data, label, plot_class_list=None, color_list=None):
    """Embed features with t-SNE and scatter-plot a selection of classes.

    The features are reduced to 2-D, min-max scaled to [0, 1] per axis and
    drawn on the z = 0 plane of a 3-D axes. The first sample (in data order)
    of every plotted class is annotated with its label.

    :param data: numpy array, N x dim
    :param label: numpy array with N class labels aligned with ``data``
    :param plot_class_list: class labels to draw; defaults to
        ``[79, 86, 87, 91]``
    :param color_list: one colour per entry of ``plot_class_list``; defaults
        to ``["#000000", "#B22222", "#9400D3", "#FFA500"]``
    :return: the matplotlib figure (returned after ``plt.show()``)
    :raises ValueError: if there are fewer colours than classes to plot
    """
    print('data shape: {}, label shape: {}'.format(data.shape, label.shape))

    if plot_class_list is None:
        plot_class_list = [79, 86, 87, 91]
    if color_list is None:
        color_list = ["#000000", "#B22222", "#9400D3", "#FFA500"]
    if len(color_list) < len(plot_class_list):
        raise ValueError(
            'need one colour per class: got {} colours for {} classes'.format(
                len(color_list), len(plot_class_list)))

    # The iteration count is left at the library default (1000): the keyword
    # was renamed from ``n_iter`` to ``max_iter`` in newer scikit-learn
    # releases, so omitting it keeps the same setting on old and new versions.
    tsne = TSNE(n_components=2, init='random', random_state=100, perplexity=40)
    data = tsne.fit_transform(data)

    # Min-max scale each embedding axis to [0, 1]; a degenerate axis (all
    # values equal) is mapped to 0 instead of dividing by zero.
    x_min, x_max = np.min(data, 0), np.max(data, 0)
    span = x_max - x_min
    span[span == 0] = 1.0
    data_norm = (data - x_min) / span

    fig = plt.figure(dpi=600)
    # ``Axes3D(fig)`` no longer attaches itself to the figure in current
    # Matplotlib releases; requesting the 3-D projection through the figure
    # works across versions (the file's Axes3D import registers the
    # projection on old ones).
    ax = fig.add_subplot(111, projection='3d')

    label = np.asarray(label)
    for cls, color in zip(plot_class_list, color_list):
        idx = np.flatnonzero(label == cls)
        if idx.size == 0:
            # No sample of this class in the data: nothing to draw.
            continue
        first = idx[0]
        cls_name = str(label[first])
        # One scatter call per class rather than per sample: much faster and
        # yields a single legend entry per class if a legend is enabled.
        ax.scatter(data_norm[idx, 0], data_norm[idx, 1], zs=0, s=10,
                   c=color, alpha=0.6, marker='o', label=cls_name)
        # Annotate the class once, at its first sample in data order.
        ax.text(data_norm[first, 0], data_norm[first, 1], 0, cls_name,
                fontdict={'weight': 'bold', 'size': 10}, color="#000000")

    ax.set_xlabel('X label')
    ax.set_ylabel('Y label')

    # To write the figure to disk without surrounding whitespace, call
    # plt.savefig(path, dpi=1000, bbox_inches='tight', pad_inches=0)
    # before plt.show().
    plt.show()

    return fig

2. 关于hub问题的评估指标斜度计算

给定一个数据集作为输入,每条数据都是一个高维向量。hub 问题指的是:存在一些数据点经常成为其它许多数据点的 $k$ 近邻点,即使这些点并不属于同一类别。记 $N_{k}^{h}(x)$ 为数据点 $x$ 作为其它数据点 $k$ 近邻点的次数,该变量分布的倾斜程度用斜度(Skewness)来衡量;特别地,正态分布的斜度为 0。

下方代码的输入为一个包含 $L$ 个数据点的数据集(pickle 文件),参数 `N` 即近邻数 $k$,默认 $k=5$。

def Skewness(pkl_data_path, N=5):
    """Measure hubness of the ``test_unseen`` features via k-occurrence skewness.

    For every data point the ``N`` most cosine-similar points are retrieved;
    the number of times each point shows up in those neighbour lists is
    counted, and the skewness of that count distribution is printed and
    returned.

    :param pkl_data_path: path to a pickle file holding a dict whose
        ``['test_unseen']['fea']`` entry is a 2-D numpy array with one row
        per data point
    :param N: number of nearest neighbours (k) retrieved per data point
    :return: skewness of the neighbour-occurrence counts (``pandas.Series.skew``)
    """
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    with open(pkl_data_path, 'rb') as fh:
        all_dataset = pickle.load(fh, encoding='utf-8')

    tu_data = t.from_numpy(all_dataset['test_unseen']['fea'])
    # Use the GPU when one is present, otherwise stay on the CPU.
    if t.cuda.is_available():
        tu_data = tu_data.cuda()

    num_points = len(tu_data)
    print("data points number: {}".format(num_points))      # L

    # L2-normalise the rows so the matrix product below is cosine similarity
    # (a Euclidean distance could be substituted here).
    tu_data_norm = F.normalize(tu_data, p=2, dim=-1)
    cos_sim = t.mm(tu_data_norm, tu_data_norm.t())            # L x L

    # Indices of the N most similar points for every row.
    # NOTE(review): a point's similarity to itself is normally the largest
    # entry of its row, so each neighbour list usually contains the point
    # itself; pass N = k + 1 if self-matches should not count -- confirm the
    # intended definition.
    values, indices = cos_sim.topk(N, dim=1, largest=True, sorted=True)
    print("KNN size: {}".format(values.size()))

    # Occurrences of every point across all neighbour lists, in one pass.
    counts = np.bincount(indices.cpu().numpy().ravel(), minlength=num_points)
    # Sanity check: the total must equal L * N.
    print("check count_list: {}".format(counts.sum()))

    # Skewness reference: https://www.cnblogs.com/jiaxin359/p/8977333.html
    sc = pd.Series(counts)
    skewness = sc.skew()
    print(skewness)
    # The original called ``.mean()`` on a plain list, which raises
    # AttributeError; the Series provides the intended mean.
    print(sc.mean())
    return skewness
    # print(sc.kurt())