ZLUDA | How to Run CUDA Code on an Intel GPU?

Part 0: Preface

The GitHub repo for this demo (continuously updated): github.com/Conqueror71…

Feel free to visit:

Personal blog: conqueror712.github.io/

Zhihu: www.zhihu.com/people/soeu…

Bilibili: space.bilibili.com/57089326

Juejin: juejin.cn/user/129787…

Readers are welcome to point out any oversights or mistakes!

Part 1: Everything Starts with C++!

First we need a piece of C++ code. Its job is very simple: add two arrays element by element.

Note: this is the Linux version.

#include <iostream>
#include <math.h>
#include <sys/time.h>

// function to add the elements of two arrays
void add(int n, float *x, float *y){
  for (int i = 0; i < n; i++)
      y[i] = x[i] + y[i];
}

int main(){
  int N = 1<<25; // ~33M elements

  float *x = new float[N];
  float *y = new float[N];

  // initialize x and y arrays on the host
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  struct timeval t1, t2;
  double timeuse;
  gettimeofday(&t1, NULL);
  // Run add on ~33M elements on the CPU
  add(N, x, y);
  gettimeofday(&t2, NULL);
  // elapsed time in milliseconds
  timeuse = (t2.tv_sec - t1.tv_sec) * 1000.0 + (double)(t2.tv_usec - t1.tv_usec) / 1000.0;

  std::cout << "add(int, float*, float*) time: " << timeuse << "ms" << std::endl;

  // Check for errors (all values should be 3.0f)
  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = fmax(maxError, fabs(y[i] - 3.0f));
  std::cout << "Max error: " << maxError << std::endl;

  // Free memory
  delete [] x;
  delete [] y;

  return 0;
}

Adding a few interesting things turns it into CUDA code:

  • __global__ on add, which turns the function into a GPU kernel
  • cudaMallocManaged(&x, N*sizeof(float)); cudaMallocManaged(&y, N*sizeof(float)); to allocate unified memory in place of new[]
  • add<<<1, 1>>>(N, x, y); to launch the kernel
  • cudaFree(x); cudaFree(y); to release the unified memory in place of delete[]
  • cudaDeviceSynchronize(); after the launch, so the host waits for the kernel to finish before timing and checking the results
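Note that add<<<1, 1>>>(N, x, y) launches one block containing one thread, so the loop still runs serially, just on the GPU. For reference only (this is not part of the demo; the name add_parallel is my own), the usual way to actually parallelize such a kernel is a grid-stride loop:

// Grid-stride loop: each thread starts at its global index and
// advances by the total number of threads in the grid.
__global__
void add_parallel(int n, float *x, float *y){
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;
  for (int i = index; i < n; i += stride)
    y[i] = x[i] + y[i];
}

// Launched with, e.g., 256 threads per block and enough blocks to cover N:
// int blockSize = 256;
// int numBlocks = (N + blockSize - 1) / blockSize;
// add_parallel<<<numBlocks, blockSize>>>(N, x, y);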

1. Linux version:

#include <iostream>
#include <math.h>
#include <sys/time.h>

// CUDA kernel to add the elements of two arrays
__global__
void add(int n, float *x, float *y){
  for (int i = 0; i < n; i++)
      y[i] = x[i] + y[i];
}

int main(){
  int N = 1<<25; // ~33M elements

  // allocate unified memory, accessible from both host and device
  float *x, *y;
  cudaMallocManaged(&x, N*sizeof(float));
  cudaMallocManaged(&y, N*sizeof(float));

  // initialize x and y arrays on the host
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  struct timeval t1, t2;
  double timeuse;
  gettimeofday(&t1, NULL);
  // Run kernel on ~33M elements on the GPU
  add<<<1, 1>>>(N, x, y);
  cudaDeviceSynchronize(); // wait for the kernel to finish before stopping the clock
  gettimeofday(&t2, NULL);
  // elapsed time in milliseconds
  timeuse = (t2.tv_sec - t1.tv_sec) * 1000.0 + (double)(t2.tv_usec - t1.tv_usec) / 1000.0;

  std::cout << "add(int, float*, float*) time: " << timeuse << "ms" << std::endl;

  // Check for errors (all values should be 3.0f)
  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = fmax(maxError, fabs(y[i] - 3.0f));
  std::cout << "Max error: " << maxError << std::endl;

  // Free unified memory
  cudaFree(x);
  cudaFree(y);

  return 0;
}
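A side note on timing: gettimeofday measures wall-clock time on the host. If you only care about the kernel's own execution time, CUDA events are the standard alternative; a minimal sketch (an alternative to, not part of, the demo above):

// Timing the same launch with CUDA events instead of gettimeofday.
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
add<<<1, 1>>>(N, x, y);
cudaEventRecord(stop);

cudaEventSynchronize(stop);             // wait until the kernel has finished
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop); // elapsed time between events, in ms
std::cout << "kernel time: " << ms << "ms" << std::endl;

cudaEventDestroy(start);
cudaEventDestroy(stop);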

2. Windows version:

#include <iostream>
#include <math.h>
#include <windows.h>

// CUDA kernel to add the elements of two arrays
__global__
void add(int n, float *x, float *y){
  for (int i = 0; i < n; i++)
      y[i] = x[i] + y[i];
}

int main(){
  int N = 1<<25; // ~33M elements

  // allocate unified memory, accessible from both host and device
  float *x, *y;
  cudaMallocManaged(&x, N*sizeof(float));
  cudaMallocManaged(&y, N*sizeof(float));

  // initialize x and y arrays on the host
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  LARGE_INTEGER t1, t2, freq;
  double timeuse;
  QueryPerformanceFrequency(&freq);
  QueryPerformanceCounter(&t1);
  // Run kernel on ~33M elements on the GPU
  add<<<1, 1>>>(N, x, y);
  cudaDeviceSynchronize(); // wait for the kernel to finish before stopping the clock
  QueryPerformanceCounter(&t2);
  // elapsed time in milliseconds
  timeuse = (double)(t2.QuadPart - t1.QuadPart) / (double)freq.QuadPart * 1000.0;

  std::cout << "add(int, float*, float*) time: " << timeuse << "ms" << std::endl;

  // Check for errors (all values should be 3.0f)
  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = fmax(maxError, fabs(y[i] - 3.0f));
  std::cout << "Max error: " << maxError << std::endl;

  // Free unified memory
  cudaFree(x);
  cudaFree(y);

  return 0;
}

Part 2: How do you get this code running on the CPU?

This one is simple: just compile and run it directly.

[Screenshot: compiling and running the program on the CPU]
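For the plain C++ version from Part 1, a typical build-and-run looks like this (the file name add.cpp is my own choice, not from the repo):

g++ -O2 -o add add.cpp
./add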

Part 3: How do you get a piece of CUDA code running on an NVIDIA GPU?

First, make sure the machine has a working CUDA environment. Here we will mention just one detail about setting up CUDA on Linux:

  • If some packages fail to download while installing CUDA, try adding --fix-missing, i.e. sudo apt install nvidia-cuda-toolkit --fix-missing

Then run it as shown in the screenshots and it just works! You can see it is clearly much faster (I am on a GTX 1650; screenshots attached).

[Screenshots: compiling and running the CUDA version on a GTX 1650]
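For reference, building and running the CUDA version on Linux looks like this (the file name add.cu is illustrative):

nvcc -o add_cuda add.cu
./add_cuda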

Part 4: How do you get a piece of CUDA code running on ZLUDA?

ZLUDA repository: github.com/vosen/ZLUDA

This is a bit more involved than the previous steps, and there is little material about it online, but after some exploration I worked out a method.

Prerequisites:

  • Visual Studio 2019 (yes, really: it must be a 2017–2019 version; I first installed 2022 and it did not work)
  • Rust (the simplest installation is fine; you can verify it with cargo --version and rustc --version, as in the check sketched after this list)
  • Visual Studio's cl.exe must be added to the PATH environment variable; you can use Everything to find where the file lives
  • Clone the ZLUDA repository above and build it; the build steps are shown below
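A quick way to sanity-check the prerequisites is to run the following in a terminal; each should print a version banner rather than a "not recognized" error (the exact versions will differ per machine):

cargo --version
rustc --version
cl
nvcc --version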

[Screenshot: cargo build error caused by the missing VS toolchain]

A build error like the one above is what you get when the VS toolchain is missing.

1. How do you build ZLUDA?

First cd into the ZLUDA checkout directory, open a terminal, and run cargo build --release.
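For reference, the whole sequence from clone to build looks like this (where you clone it is up to you); the build artifacts end up under target\release (the example later in this post calls zluda_with.exe from target\release\deps):

git clone https://github.com/vosen/ZLUDA
cd ZLUDA
cargo build --release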

2. How do you compile a .cu file into an .exe?

Just cd into your project directory and run: nvcc -o my_app.exe my_app.cu

In my case that was: nvcc -o hello.exe windows-hello.cu

3. How do you run the code?

In principle any directory should do, but if it fails, run the following from the ZLUDA directory.

Windows:

<ZLUDA_DIRECTORY>\zluda_with.exe -- <APPLICATION> <APPLICATIONS_ARGUMENTS>

Linux:

LD_LIBRARY_PATH=<ZLUDA_DIRECTORY> <APPLICATION> <APPLICATIONS_ARGUMENTS>

In my testing the last parameter can be left empty; <APPLICATION> is your compiled .exe. In my case:

D:\My_Files\Coding-Project-2023\OSPP\OSPP-THU-CUDA\Start\ZLUDA\ZLUDA\target\release\deps\zluda_with.exe -- hello.exe

[Screenshot: running hello.exe through zluda_with.exe]

Even faster than the plain GPU run! Admittedly the variables are not controlled here (a proper comparison should be done on Linux), but the speedup is plain to see.
