零、前言：

此demo的Github仓库（持续更新）：github.com/Conqueror71…

?欢迎访问：

个人博客：conqueror712.github.io/

知乎：www.zhihu.com/people/soeu…

Bilibili：space.bilibili.com/57089326

掘金：juejin.cn/user/129787…

有任何疏忽和错误欢迎各位读者指出！

一、一切从C++开始！

首先我们要得到一份C++代码，功能很简单，就是做加法：

注意这是Linux版本


#include <iostream>
#include <math.h>
#include <sys/time.h>
// function to add the elements of two arrays
void add(int n, float *x, float *y){
  for (int i = 0; i < n; i++)
      y[i] = x[i] + y[i];
}
int main(){
  int N = 1<<25; // 30M elements
  float *x = new float[N];
  float *y = new float[N];
  // initialize x and y arrays on the host
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }
  struct timeval t1,t2;
  double timeuse;
  gettimeofday(&t1,NULL);
  // Run kernel on 30M elements on the CPU
  void add(N, x, y);
  gettimeofday(&t2,NULL);
  timeuse = (t2.tv_sec - t1.tv_sec) + (double)(t2.tv_usec - t1.tv_usec)/1000.0;
  std::cout << "add(int, float*, float*) time: " << timeuse << "ms" << std::endl;
  // Check for errors (all values should be 3.0f)
  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = fmax(maxError, fabs(y[i]-3.0f));
  std::cout << "Max error: " << maxError << std::endl;
  // Free memory
  delete [] x;
  delete [] y;
  
  return 0;
}
#include <iostream>


#include <math.h>


#include <sys/time.h>




// function to add the elements of two arrays

void add(int n, float *x, float *y){
  for (int i = 0; i < n; i++)
      y[i] = x[i] + y[i];
}

int main(){
  int N = 1<<25; // 30M elements

  float *x = new float[N];
  float *y = new float[N];
  // initialize x and y arrays on the host
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  struct timeval t1,t2;
  double timeuse;
  gettimeofday(&t1,NULL);
  // Run kernel on 30M elements on the CPU
  void add(N, x, y);
  gettimeofday(&t2,NULL);
  timeuse = (t2.tv_sec - t1.tv_sec) + (double)(t2.tv_usec - t1.tv_usec)/1000.0;

  std::cout << "add(int, float*, float*) time: " << timeuse << "ms" << std::endl;
  // Check for errors (all values should be 3.0f)
  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = fmax(maxError, fabs(y[i]-3.0f));
  std::cout << "Max error: " << maxError << std::endl;

  // Free memory
  delete [] x;
  delete [] y;
  
  return 0;
}
#include <iostream>


#include <math.h>


#include <sys/time.h>




// function to add the elements of two arrays

void add(int n, float *x, float *y){
  for (int i = 0; i < n; i++)
      y[i] = x[i] + y[i];
}

int main(){
  int N = 1<<25; // 30M elements

  float *x = new float[N];
  float *y = new float[N];
  // initialize x and y arrays on the host
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  struct timeval t1,t2;
  double timeuse;
  gettimeofday(&t1,NULL);
  // Run kernel on 30M elements on the CPU
  void add(N, x, y);
  gettimeofday(&t2,NULL);
  timeuse = (t2.tv_sec - t1.tv_sec) + (double)(t2.tv_usec - t1.tv_usec)/1000.0;

  std::cout << "add(int, float*, float*) time: " << timeuse << "ms" << std::endl;
  // Check for errors (all values should be 3.0f)
  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = fmax(maxError, fabs(y[i]-3.0f));
  std::cout << "Max error: " << maxError << std::endl;

  // Free memory
  delete [] x;
  delete [] y;
  
  return 0;
}

添加一些有趣的东西就可以变成CUDA代码：

__global__
cudaMallocManaged(&x, N*sizeof(float)); cudaMallocManaged(&y, N*sizeof(float));
add<<<1, 1>>>(N, x, y);
cudaFree(x); cudaFree(y);

1. Linux版本：


#include <iostream>
#include <math.h>
#include <sys/time.h>
// function to add the elemsents of two arrays
__global__
void add(int n, float *x, float *y){
  for (int i = 0; i < n; i++)
      y[i] = x[i] + y[i];
}
int main(){
  int N = 1<<25; // 30M elements
  float *x = new float[N];
  float *y = new float[N];
  cudaMallocManaged(&x, N*sizeof(float));
  cudaMallocManaged(&y, N*sizeof(float));
  // initialize x and y arrays on the host
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }
  struct timeval t1,t2;
  double timeuse;
  gettimeofday(&t1,NULL);
  // Run kernel on 30M elements on the CPU
  add<<<1, 1>>>(N, x, y);
  gettimeofday(&t2,NULL);
  timeuse = (t2.tv_sec - t1.tv_sec) + (double)(t2.tv_usec - t1.tv_usec)/1000.0;
  std::cout << "add(int, float*, float*) time: " << timeuse << "ms" << std::endl;
  // Check for errors (all values should be 3.0f)
  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = fmax(maxError, fabs(y[i]-3.0f));
  std::cout << "Max error: " << maxError << std::endl;
  // Free memory
  delete [] x;
  delete [] y;
  cudaFree(x);
  cudaFree(y);
  
  return 0;
}
#include <iostream>


#include <math.h>


#include <sys/time.h>




// function to add the elemsents of two arrays
__global__

void add(int n, float *x, float *y){

  for (int i = 0; i < n; i++)

      y[i] = x[i] + y[i];

}



int main(){

  int N = 1<<25; // 30M elements



  float *x = new float[N];

  float *y = new float[N];

  cudaMallocManaged(&x, N*sizeof(float));

  cudaMallocManaged(&y, N*sizeof(float));

  // initialize x and y arrays on the host

  for (int i = 0; i < N; i++) {

    x[i] = 1.0f;

    y[i] = 2.0f;

  }



  struct timeval t1,t2;
  double timeuse;

  gettimeofday(&t1,NULL);
  // Run kernel on 30M elements on the CPU
  add<<<1, 1>>>(N, x, y);
  gettimeofday(&t2,NULL);
  timeuse = (t2.tv_sec - t1.tv_sec) + (double)(t2.tv_usec - t1.tv_usec)/1000.0;

  std::cout << "add(int, float*, float*) time: " << timeuse << "ms" << std::endl;
  // Check for errors (all values should be 3.0f)
  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = fmax(maxError, fabs(y[i]-3.0f));
  std::cout << "Max error: " << maxError << std::endl;

  // Free memory
  delete [] x;
  delete [] y;

  cudaFree(x);
  cudaFree(y);
  
  return 0;
}
#include <iostream>


#include <math.h>


#include <sys/time.h>




// function to add the elemsents of two arrays
__global__

void add(int n, float *x, float *y){

  for (int i = 0; i < n; i++)

      y[i] = x[i] + y[i];

}



int main(){

  int N = 1<<25; // 30M elements



  float *x = new float[N];

  float *y = new float[N];

  cudaMallocManaged(&x, N*sizeof(float));

  cudaMallocManaged(&y, N*sizeof(float));

  // initialize x and y arrays on the host

  for (int i = 0; i < N; i++) {

    x[i] = 1.0f;

    y[i] = 2.0f;

  }



  struct timeval t1,t2;
  double timeuse;

  gettimeofday(&t1,NULL);
  // Run kernel on 30M elements on the CPU
  add<<<1, 1>>>(N, x, y);
  gettimeofday(&t2,NULL);
  timeuse = (t2.tv_sec - t1.tv_sec) + (double)(t2.tv_usec - t1.tv_usec)/1000.0;

  std::cout << "add(int, float*, float*) time: " << timeuse << "ms" << std::endl;
  // Check for errors (all values should be 3.0f)
  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = fmax(maxError, fabs(y[i]-3.0f));
  std::cout << "Max error: " << maxError << std::endl;

  // Free memory
  delete [] x;
  delete [] y;

  cudaFree(x);
  cudaFree(y);
  
  return 0;
}

2. Windows版本：


#include <iostream>
#include <math.h>
#include <windows.h>
// function to add the elements of two arrays
__global__
void add(int n, float *x, float *y){
  for (int i = 0; i < n; i++)
      y[i] = x[i] + y[i];
}
int main(){
  int N = 1<<25; // 30M elements
  float *x = new float[N];
  float *y = new float[N];
  cudaMallocManaged(&x, N*sizeof(float));
  cudaMallocManaged(&y, N*sizeof(float));
  // initialize x and y arrays on the host
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }
  ULONGLONG t1, t2, freq;
  double timeuse;
  QueryPerformanceFrequency((LARGE_INTEGER*)&freq);
  QueryPerformanceCounter((LARGE_INTEGER*)&t1);
  // Run kernel on 30M elements on the CPU
  add<<<1, 1>>>(N, x, y);
  QueryPerformanceCounter((LARGE_INTEGER*)&t2);
  timeuse = (double)(t2 - t1) / (double)freq * 1000.0;
  std::cout << "add(int, float*, float*) time: " << timeuse << "ms" << std::endl;
  // Check for errors (all values should be 3.0f)
  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = fmax(maxError, fabs(y[i]-3.0f));
  std::cout << "Max error: " << maxError << std::endl;
  // Free memory
  delete [] x;
  delete [] y;
  cudaFree(x);
  cudaFree(y);
  
  return 0;
}
#include <iostream>


#include <math.h>


#include <windows.h>



// function to add the elements of two arrays

__global__

void add(int n, float *x, float *y){

  for (int i = 0; i < n; i++)

      y[i] = x[i] + y[i];

}



int main(){

  int N = 1<<25; // 30M elements



  float *x = new float[N];

  float *y = new float[N];

  cudaMallocManaged(&x, N*sizeof(float));

  cudaMallocManaged(&y, N*sizeof(float));

  // initialize x and y arrays on the host

  for (int i = 0; i < N; i++) {

    x[i] = 1.0f;

    y[i] = 2.0f;

  }



  ULONGLONG t1, t2, freq;
  double timeuse;

  QueryPerformanceFrequency((LARGE_INTEGER*)&freq);
  QueryPerformanceCounter((LARGE_INTEGER*)&t1);
  // Run kernel on 30M elements on the CPU
  add<<<1, 1>>>(N, x, y);
  QueryPerformanceCounter((LARGE_INTEGER*)&t2);
  timeuse = (double)(t2 - t1) / (double)freq * 1000.0;

  std::cout << "add(int, float*, float*) time: " << timeuse << "ms" << std::endl;
  // Check for errors (all values should be 3.0f)
  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = fmax(maxError, fabs(y[i]-3.0f));
  std::cout << "Max error: " << maxError << std::endl;

  // Free memory
  delete [] x;
  delete [] y;

  cudaFree(x);
  cudaFree(y);
  
  return 0;
}
#include <iostream>


#include <math.h>


#include <windows.h>



// function to add the elements of two arrays

__global__

void add(int n, float *x, float *y){

  for (int i = 0; i < n; i++)

      y[i] = x[i] + y[i];

}



int main(){

  int N = 1<<25; // 30M elements



  float *x = new float[N];

  float *y = new float[N];

  cudaMallocManaged(&x, N*sizeof(float));

  cudaMallocManaged(&y, N*sizeof(float));

  // initialize x and y arrays on the host

  for (int i = 0; i < N; i++) {

    x[i] = 1.0f;

    y[i] = 2.0f;

  }



  ULONGLONG t1, t2, freq;
  double timeuse;

  QueryPerformanceFrequency((LARGE_INTEGER*)&freq);
  QueryPerformanceCounter((LARGE_INTEGER*)&t1);
  // Run kernel on 30M elements on the CPU
  add<<<1, 1>>>(N, x, y);
  QueryPerformanceCounter((LARGE_INTEGER*)&t2);
  timeuse = (double)(t2 - t1) / (double)freq * 1000.0;

  std::cout << "add(int, float*, float*) time: " << timeuse << "ms" << std::endl;
  // Check for errors (all values should be 3.0f)
  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = fmax(maxError, fabs(y[i]-3.0f));
  std::cout << "Max error: " << maxError << std::endl;

  // Free memory
  delete [] x;
  delete [] y;

  cudaFree(x);
  cudaFree(y);
  
  return 0;
}