各种距离算法小记

2025年10月7日 50点热度 0人点赞 0条评论

最近在翻一些距离算法并考虑他们的GPU并行化实现,记录一下。

这里从图片转成向量开始,利用opencv配合torchvision,转成tensor还是比较容易的。

import cv2
import torchvision.transforms.functional as F
import torchvision.transforms as T

image = cv2.imread('test_pic1.png')

print(type(image),image.shape)
img1 = F.to_tensor(image)
print(type(img1),img1.shape)
print(img1)

这里的to_tensor的大致实现如下:

def to_tensor(pic):
    """Convert a ``PIL Image`` or ``numpy.ndarray`` to tensor.
    This function does not support torchscript.

    See :class:`~torchvision.transforms.ToTensor` for more details.

    Args:
        pic (PIL Image or numpy.ndarray): Image to be converted to tensor.

    Returns:
        Tensor: Converted image.
    """
    if not(F_pil._is_pil_image(pic) or _is_numpy(pic)):
        raise TypeError('pic should be PIL Image or ndarray. Got {}'.format(type(pic)))

    if _is_numpy(pic) and not _is_numpy_image(pic):
        raise ValueError('pic should be 2/3 dimensional. Got {} dimensions.'.format(pic.ndim))

    default_float_dtype = torch.get_default_dtype()


    if isinstance(pic, np.ndarray):
        # handle numpy array
        # 如果是灰度的二维图像,加一个通道维度
        if pic.ndim == 2:
            pic = pic[:, :, None]

        img = torch.from_numpy(pic.transpose((2, 0, 1))).contiguous()
        # backward compatibility
        # 除以255
        if isinstance(img, torch.ByteTensor):
            return img.to(dtype=default_float_dtype).div(255)
        else:
            return img

    if accimage is not None and isinstance(pic, accimage.Image):
        nppic = np.zeros([pic.channels, pic.height, pic.width], dtype=default_float_dtype)
        pic.copyto(nppic)
        return torch.from_numpy(nppic)

    # 处理PIL的照片
    if pic.mode == 'I':
        img = torch.from_numpy(np.array(pic, np.int32, copy=False))
    elif pic.mode == 'I;16':
        img = torch.from_numpy(np.array(pic, np.int16, copy=False))
    elif pic.mode == 'F':
        img = torch.from_numpy(np.array(pic, np.float32, copy=False))
    elif pic.mode == '1':
        img = 255 * torch.from_numpy(np.array(pic, np.uint8, copy=False))
    else:
        img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))

    img = img.view(pic.size[1], pic.size[0], len(pic.getbands()))
    # 把HCW转成CHW格式
    img = img.permute((2, 0, 1)).contiguous()
    if isinstance(img, torch.ByteTensor):
        return img.to(dtype=default_float_dtype).div(255)
    else:
        return img

从熟悉的欧氏距离开始,一个经典的欧氏距离就是两点之间计算,公式如下:

d(p,q) = \sqrt {(p_1 - q_1 )^2 + (p_2 - q_2 )^2 }

在几何距离和数学建模常见的聚类(例如K-Means)计算相对常用。

对于两个点之间的差异,还有一个常用于网格路径优化的距离就是曼哈顿距离,本身的定义是两个坐标在坐标轴上的绝对值之和。

d = \left| {x_1 - x_2 } \right| + \left| {y_1 - y_2 } \right|

pytorch的快速实现:

import torch
def EuclideanDistances(a,b):
    sq_a = a**2
    sum_sq_a = torch.sum(sq_a,dim=1).unsqueeze(1)  # m->[m, 1]
    print(sum_sq_a)
    sq_b = b**2
    sum_sq_b = torch.sum(sq_b,dim=1).unsqueeze(0)  # n->[1, n]
    bt = b.t()
    return torch.sqrt(sum_sq_a+sum_sq_b-2*a.mm(bt))
eu_dis = EuclideanDistances(img1.view(1,-1), img1.view(1,-1))
print(eu_dis)  #两个tensor的欧氏距离,应该是0,但由于精度问题,结果会有误差,如果改为float64结果就稳定很多

此时就得引入范数的概念了,因为pytorch使用的是统一的距离计算函数,采用的就是范数的概念。

对于一个n维的向量x = (x_1 ,x_2 ,x_3 ,{\rm{\cdot\cdot\cdot}},x_n ),p范数是向量各元素取p次方求和再开p次方根来衡量向量的长度。对应的式子:||x||_p = ( \sum_{i=1}^{n} |x_i|^p )^{1/p}

从这里也能看的出来,曼哈顿距离对应的就是L1的范数,欧氏距离对应的是L2范数,torch在CPU相关的代码如下,这里可以发现torch在传给具体硬件进行计算之前会将两个tensor进行广播和展平,在后续的运算中提供了数据的并行性。

template <typename F>
// 这里的张量是已经处理好的,
static void run_parallel_pdist(Tensor& result, const Tensor& self, const scalar_t p) {
  const scalar_t * const self_start = self.const_data_ptr<scalar_t>();
  const scalar_t * const self_end = self_start + self.numel();
  int64_t n = self.size(0);
  int64_t m = self.size(1);

  scalar_t * const res_start = result.data_ptr<scalar_t>();
  int64_t combs = result.numel(); // n * (n - 1) / 2

  // 并行计算,将输出索引k范围[0,comb-1]分成多个块,每个线程处理一段连续的k值
  parallel_for(0, combs, internal::GRAIN_SIZE / (16 * m), [p, self_start, self_end, n, m, res_start](int64_t k, int64_t end) {
    const Vec pvec(p);
    double n2 = n - .5;
    // The -1 accounts for floating point truncation issues
    // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
    // 将k映射到矩阵的i和j
    int64_t i = static_cast<int64_t>((n2 - std::sqrt(n2 * n2 - 2 * k - 1)));
    int64_t j = k - n * i + i * (i + 1) / 2 + i + 1;

    const scalar_t * self_i = self_start + i * m;
    const scalar_t * self_j = self_start + j * m;
    scalar_t * res = res_start + k;
    const scalar_t * const res_end = res_start + end;

    while (res != res_end) {
      // 这里的向量化操作调用了CPU的SIMD,先计算|a-b|^p,再开p次方
      *res = F::finish(vec::map2_reduce_all<scalar_t>(
        [&pvec](Vec a, Vec b) { return F::map((a - b).abs(), pvec); },
        F::red, self_i, self_j, m), p);

      res += 1;
      self_j += m;
      if (self_j == self_end) {
        self_i += m;
        self_j = self_i + m;
      }
    }
  });
}

在cuda的实现如下:

template <typename scalar_t, typename F>
__global__ static void cdist_backward_kernel_cuda_impl(scalar_t * buffer, const scalar_t * grad, const scalar_t * x1, const scalar_t * x2, const scalar_t * dist,
                                                       const scalar_t p, const int64_t r1, const int64_t r2, const int64_t m, const int64_t count, const int64_t r_size, const int64_t l1_size, const int64_t l2_size) {
  // 线程分配                   
  const int y = (blockIdx.y * gridDim.z + blockIdx.z) * blockDim.y + threadIdx.y;
  const int init = blockIdx.x * blockDim.x + threadIdx.x;
  if (y >= count || init >= m) {
    return;
  }
  const int l = y / r_size;
  const int k = y % r_size;
  const int stride = blockDim.x * gridDim.x;
  const int l_size = r_size * m;

  int64_t i = k / r2;
  int64_t j = k % r2;

  const scalar_t grad_k = grad[y];
  const scalar_t dist_k = dist[y];

  const scalar_t * const start = x1 + l * l1_size + i * m;
  const scalar_t * const end = start + m;
  const scalar_t * self_i = start + init;
  const scalar_t * self_j = x2 + l * l2_size + j * m + init;

  scalar_t * buff_i = buffer + l * l_size + (r1 * j + i) * m + init;

  // 内存访问连续
  for (; self_i < end; self_i += stride, self_j += stride, buff_i += stride) {
    const scalar_t res = F::backward(*self_i - *self_j, grad_k, dist_k, p);
    *buff_i = res;
  }
}

在分布比较和一些GAN生成对抗网络(https://arxiv.org/abs/1701.07875)中,度量距离的方法有一个是Wasserstein距离的方法,这里的距离度量的就不是两个向量了,而是两个概率分布。我们将两个分布P,Q看成两堆土,如下图所示,希望把其中的一堆土移成另一堆土的位置和形状,有很多种可能的方案。推土代价被定义为移动土的量乘以土移动的距离,在所有的方案中,存在一种推土代价最小的方案,这个代价就称为两个分布的 Wasserstein 距离,也因此被称为推土机距离。

数学的表达太过复杂,不过有大佬写好了这些方便理解。

https://zhuanlan.zhihu.com/p/441197063

文章通过做功的方式给出了直观的理解,将求解处理成了线性规划问题。

先看一下每对图像之间的距离如何计算(调库)的:

query_image = cv2.imread('query_image.png', cv2.IMREAD_GRAYSCALE)
img = cv2.imread(f'image_{i}.png', cv2.IMREAD_GRAYSCALE)
hist, _ = np.histogram(query_image, bins=10, range=(0, 255), density=True)

# 计算距离,这里的cv2.DIST_L2正是之前提到的欧氏距离,在这里用来度量向量每个元素距离的
hist2, _ = np.histogram(img, bins=10, range=(0, 255), density=True)
distance = cv2.EMD(hist, hist2, cv2.DIST_L2)

对应的代码文件是opencv源码仓库下的opencv/modules/imgproc/src/emd_new.cpp中,可以看到EMD函数本身是创建了个求解器进行计算的。

float cv::EMD(InputArray _sign1,
              InputArray _sign2,
              int distType,
              InputArray _cost,
              float* lowerBound,
              OutputArray _flow)
{
    CV_INSTRUMENT_REGION();

    Mat sign1 = _sign1.getMat();
    Mat sign2 = _sign2.getMat();
    Mat cost = _cost.getMat();

    CV_CheckEQ(sign1.cols, sign2.cols, "Signatures must have equal number of columns");
    CV_CheckEQ(sign1.type(), CV_32FC1, "The sign1 must be 32FC1");
    CV_CheckEQ(sign2.type(), CV_32FC1, "The sign2 must be 32FC1");

    const int dims = sign1.cols - 1;
    const int size1 = sign1.rows;
    const int size2 = sign2.rows;

    Mat flow;
    if (_flow.needed())
    {
        _flow.create(sign1.rows, sign2.rows, CV_32F);
        flow = _flow.getMat();
        flow = Scalar::all(0);
        CV_CheckEQ(flow.type(), CV_32FC1, "Flow matrix must have type 32FC1");
        CV_CheckTrue(flow.rows == size1 && flow.cols == size2,
                     "Flow matrix size does not match signatures");
    }

    DistFunc dfunc = 0;
    if (distType == DIST_USER)
    {
        if (!cost.empty())
        {
            CV_CheckEQ(cost.type(), CV_32FC1, "Cost matrix must have type 32FC1");
            CV_CheckTrue(cost.rows == size1 && cost.cols == size2,
                         "Cost matrix size does not match signatures");
            CV_CheckTrue(lowerBound == NULL,
                         "Lower boundary can not be calculated if the cost matrix is used");
        }
        else
        {
            CV_CheckTrue(dfunc == NULL, "Dist function must be set if cost matrix is empty");
        }
    }
    else
    {
        CV_CheckNE(dims, 0, "Number of dimensions can be 0 only if a user-defined metric is used");
        switch (distType)
        {
            case cv::DIST_L1: dfunc = distL1; break;
            case cv::DIST_L2: dfunc = distL2; break;
            case cv::DIST_C: dfunc = distC; break;
            default: CV_Error(cv::Error::StsBadFlag, "Bad or unsupported metric type");
        }
    }

    EMDSolver state;
    const bool result = state.init(sign1, sign2, dims, dfunc, cost, lowerBound);
    if (!result && lowerBound)
    {
        return *lowerBound;
    }
    // 调用迭代求解的函数
    state.solve();
    return (float)(state.calcFlow(_flow.needed() ? &flow : 0) / state.getWeight());
}

// 开始迭代求解
void EMDSolver::solve()
{
    if (ssize > 1 && dsize > 1)
    {
        const float eps = CV_EMD_EPS * max_cost;
        for (int itr = 1; itr < MAX_ITERATIONS; itr++)
        {
            /* find basic variables */
            if (findBasicVars() < 0)
                break;

            /* check for optimality */
            const float min_delta = checkOptimal();

            if (min_delta == CV_EMD_INF)
                CV_Error(cv::Error::StsNoConv, "");

            /* if no negative deltamin, we found the optimal solution */
            if (min_delta >= -eps)
                break;

            /* improve solution */
            if (!checkNewSolution())
                CV_Error(cv::Error::StsNoConv, "");
        }
    }
}

求解的基本集中于findBasicVars这块,对应的代码在下面,可以看到opencv是通过将u和v的节点做成一个链表进行管理,数据上并没有并行,从计算本身也没看出什么并行。

int EMDSolver::findBasicVars() const
{
    int i, j;
    int u_cfound, v_cfound;
    Node1D u0_head, u1_head, *cur_u, *prev_u;
    Node1D v0_head, v1_head, *cur_v, *prev_v;
    bool found;

    CV_Assert(u != 0 && v != 0);

    /* initialize the rows list (u) and the columns list (v) */
    u0_head.next = u;
    for (i = 0; i < ssize; i++)
    {
        u[i].next = u + i + 1;
    }
    u[ssize - 1].next = 0;
    u1_head.next = 0;

    v0_head.next = ssize > 1 ? v + 1 : 0;
    for (i = 1; i < dsize; i++)
    {
        v[i].next = v + i + 1;
    }
    v[dsize - 1].next = 0;
    v1_head.next = 0;

    /* there are ssize+dsize variables but only ssize+dsize-1 independent equations,
       so set v[0]=0 */
    v[0].val = 0;
    v1_head.next = v;
    v1_head.next->next = 0;

    /* loop until all variables are found */
    u_cfound = v_cfound = 0;
    while (u_cfound < ssize || v_cfound < dsize)
    {
        found = false;
        if (v_cfound < dsize)
        {
            /* loop over all marked columns */
            prev_v = &v1_head;
            cur_v = v1_head.next;
            found = found || (cur_v != 0);
            for (; cur_v != 0; cur_v = cur_v->next)
            {
                float cur_v_val = cur_v->val;

                j = (int)(cur_v - v);
                /* find the variables in column j */
                prev_u = &u0_head;
                for (cur_u = u0_head.next; cur_u != 0;)
                {
                    i = (int)(cur_u - u);
                    if (getIsX(i, j))
                    {
                        /* compute u[i] */
                        cur_u->val = getCost(i, j) - cur_v_val;
                        /* ...and add it to the marked list */
                        prev_u->next = cur_u->next;
                        cur_u->next = u1_head.next;
                        u1_head.next = cur_u;
                        cur_u = prev_u->next;
                    }
                    else
                    {
                        prev_u = cur_u;
                        cur_u = cur_u->next;
                    }
                }
                prev_v->next = cur_v->next;
                v_cfound++;
            }
        }

        if (u_cfound < ssize)
        {
            /* loop over all marked rows */
            prev_u = &u1_head;
            cur_u = u1_head.next;
            found = found || (cur_u != 0);
            for (; cur_u != 0; cur_u = cur_u->next)
            {
                float cur_u_val = cur_u->val;
                i = (int)(cur_u - u);
                /* find the variables in rows i */
                prev_v = &v0_head;
                for (cur_v = v0_head.next; cur_v != 0;)
                {
                    j = (int)(cur_v - v);
                    if (getIsX(i, j))
                    {
                        /* compute v[j] */
                        cur_v->val = getCost(i, j) - cur_u_val;
                        /* ...and add it to the marked list */
                        prev_v->next = cur_v->next;
                        cur_v->next = v1_head.next;
                        v1_head.next = cur_v;
                        cur_v = prev_v->next;
                    }
                    else
                    {
                        prev_v = cur_v;
                        cur_v = cur_v->next;
                    }
                }
                prev_u->next = cur_u->next;
                u_cfound++;
            }
        }

        if (!found)
            return -1;
    }

    return 0;
}

MuWinds

这个人很懒,什么都没留下

文章评论