0%

x86平台上向量化编程及实验

发表于 2023-08-01 更新于 2024-04-27 阅读次数：
本文字数： 2.4k 阅读时长 ≈ 4 分钟

向量化编程

向量化即“批量操作”。批量操作在现实生活中也很常见；在计算机中，最常见的向量化执行模型是 SIMD（Single Instruction Multiple Data，单指令多数据），即用一条指令对一批数据同时执行相同的操作，以提高效率。
Intel向量化操作手册

Intel向量化编程示例

代码案例

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <immintrin.h>  // Header for AVX intrinsics

/*
 * Computes the dot product of two float arrays using 256-bit AVX vectors.
 *
 * a, b:   arrays holding at least `length` floats each. Unaligned loads are
 *         used, so the pointers need no particular alignment.
 * length: number of elements to process; values <= 0 yield 0.0f.
 *
 * Returns the sum of a[i] * b[i] for i in [0, length).
 *
 * Only complete groups of 8 floats go through the vector path; the remaining
 * 0..7 elements are handled by a scalar loop so that no element beyond
 * `length` is ever read.
 */
float dot_productVec(float* a, float* b, int length) {
    __m256 sum = _mm256_setzero_ps();
    int i = 0;

    // `length - i >= 8` guarantees both 8-float loads stay inside the arrays
    // and cannot overflow for any non-negative i.
    for (; length - i >= 8; i += 8) {
        __m256 vecA = _mm256_loadu_ps(&a[i]);
        __m256 vecB = _mm256_loadu_ps(&b[i]);
        sum = _mm256_add_ps(sum, _mm256_mul_ps(vecA, vecB));
    }

    // Horizontal sum of the eight per-lane partial sums.
    float lanes[8];
    _mm256_storeu_ps(lanes, sum);
    float total = lanes[0] + lanes[1] + lanes[2] + lanes[3]
                + lanes[4] + lanes[5] + lanes[6] + lanes[7];

    // Scalar tail for lengths that are not a multiple of 8.
    for (; i < length; i++) {
        total += a[i] * b[i];
    }
    return total;
}

/*
 * Computes the dot product of two float arrays with a plain scalar loop.
 *
 * a, b:   arrays holding at least `length` floats each.
 * length: number of elements to process; values <= 0 yield 0.0f.
 *
 * Returns the sum of a[j] * b[j] for j in [0, length).
 */
float dot_productLoop(float* a, float* b, int length)
{
    // The accumulator must be a float: an integer accumulator would truncate
    // the running sum after every addition.
    float sum = 0.0f;
    for (int j = 0; j < length; j++)
    {
        sum += a[j] * b[j];
    }
    return sum;
}

// Reads CLOCK_MONOTONIC and returns the reading expressed in nanoseconds.
int64_t get_time_ns() {
    struct timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);

    // Seconds are scaled to nanoseconds in long long, then the sub-second
    // part is added on top.
    long long nanos = now.tv_sec * 1000000000LL;
    nanos += now.tv_nsec;
    return nanos;
}

/*
 * Times one call of dot_productVec and one call of dot_productLoop on two
 * freshly allocated and filled float arrays of `length` elements.
 *
 * length: number of elements in each test array.
 * tVec:   out; elapsed time of the dot_productVec call, as the difference of
 *         two get_time_ns() readings.
 * tLoop:  out; elapsed time of the dot_productLoop call, measured the same way.
 *
 * Both outputs are set to 0 when length <= 0 or when an allocation fails
 * (the latter also prints a message to stderr).
 */
void runTrial(int length, int64_t* tVec, int64_t* tLoop)
{
    // Default to "no measurement" so every early exit leaves defined outputs.
    *tVec = 0;
    *tLoop = 0;
    if (length <= 0) {
        return;
    }

    float* a = malloc(sizeof *a * (size_t)length);
    float* b = malloc(sizeof *b * (size_t)length);
    if (a == NULL || b == NULL) {
        fprintf(stderr, "runTrial: allocation failed for length %d\n", length);
        free(a);  // free(NULL) is a no-op, so this is safe for either pointer
        free(b);
        return;
    }

    for (int i = 0; i < length; i++) {
        a[i] = (float)i / 10;
        b[i] = (float)(length - i) / 10;
    }

    const int64_t tVec1 = get_time_ns();
    const float vecResult = dot_productVec(a, b, length);
    const int64_t tVec2 = get_time_ns();

    const int64_t tLoop1 = get_time_ns();
    const float loopResult = dot_productLoop(a, b, length);
    const int64_t tLoop2 = get_time_ns();

    // Store the results through a volatile object so the timed calls are not
    // dead code in an optimized build.
    volatile float sink = vecResult;
    sink = loopResult;
    (void)sink;

    free(a);
    free(b);

    *tVec = tVec2 - tVec1;
    *tLoop = tLoop2 - tLoop1;
}


/*
 * Runs one timing trial for every array length in [0, MAX_CNT), prints each
 * pair of timings to stdout, then writes all pairs to "data.csv" with the
 * header line "tVec,tLoop".
 *
 * Returns 0 on success, 1 if data.csv cannot be opened or written.
 */
int main() {
    enum { MAX_CNT = 5000 };
    // Fixed-size static arrays instead of VLAs: keeps 80 KB off the stack.
    static int64_t tVecs[MAX_CNT];
    static int64_t tLoops[MAX_CNT];

    for (int i = 0; i < MAX_CNT; i++)
    {
        runTrial(i, &tVecs[i], &tLoops[i]);
        // PRId64 is the portable conversion for int64_t.
        printf("length %d completed!\nVec: %" PRId64 ", Loop: %" PRId64 "\n",
               i, tVecs[i], tLoops[i]);
    }

    FILE* file = fopen("data.csv", "w");
    if (file == NULL) {
        perror("data.csv");
        return 1;
    }

    fprintf(file, "tVec,tLoop\n");
    for (int i = 0; i < MAX_CNT; i++)
    {
        fprintf(file, "%" PRId64 ", %" PRId64 "\n", tVecs[i], tLoops[i]);
    }

    // fclose flushes buffered output, so a failure here means the CSV may be
    // incomplete.
    if (fclose(file) != 0) {
        perror("data.csv");
        return 1;
    }

    return 0;
}

编译指令

gcc  -o dot_product vecCalc.c -O0 -mavx -lrt

结果展示

上图是向量化计算和纯循环计算的耗时比较。曲线上可见一些毛刺，目前尚不清楚原因。
下图是循环计算的时间/向量化计算的时间之商
结果拟合
向量化计算的斜率为0.9288296964491878，y轴的截距为172.55817372525428
循环计算的斜率为5.851229649697184， y轴截距为236.63629058187928
可见循环计算的斜率约为向量化计算的 6.30 倍