VS查看PTX代码

article/2025/9/13 14:48:51

首先声明：本人使用的是 Windows 7 操作系统。使用 Windows 8 操作系统的小伙伴可能会遇到 Nsight Monitor 无法启动的问题，原因在于 Windows 8 自带的 Framework（应指 .NET Framework）版本过新。一个可行的解决办法是：运行一个版本旧一点的 Matlab 安装程序，安装开始时它会提示下载并安装旧版本的 Framework；Framework 安装完成后就可以取消 Matlab 的安装了，此时 Nsight Monitor 也就可以正常启动了。
下面回到正题：如何在 VS（Visual Studio）中查看 PTX 代码呢？这里举一个例子，按照其中的步骤操作即可查看到 PTX 代码。

本人使用的代码是CUDA自带的一个Sample：

C/C++ code 
?
 #include "cuda_runtime.h" 
 #include "device_launch_parameters.h" 
   
 #include <stdio.h> 
   
 // Forward declaration of the host-side helper (defined later in this file)
 // that adds the `size`-element host arrays `a` and `b` on the GPU, stores
 // the result in host array `c`, and returns the resulting CUDA status code.
 cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size); 
   
 // Element-wise integer addition: each thread writes c[t] = a[t] + b[t],
 // where t is the thread's index within its block.
 //
 // Launch layout: the element index is taken from threadIdx.x alone and no
 // bounds check is performed, so the kernel must be launched as a single
 // 1-D block with exactly one thread per element to be processed.
 // Uses no shared memory.
 __global__ void addKernel(int *c, const int *a, const int *b)
 {
     const int lane = threadIdx.x;
     const int sum = a[lane] + b[lane];
     c[lane] = sum;
 }
   
 // Program entry point: adds two fixed 5-element vectors on the GPU via
 // addWithCuda, prints the sums, then resets the device.
 // Returns 0 on success and 1 if the addition or the device reset fails.
 int main()
 {
     const int kCount = 5;
     const int lhs[kCount] = { 1, 2, 3, 4, 5 };
     const int rhs[kCount] = { 10, 20, 30, 40, 50 };
     int sums[kCount] = { 0 };

     // Run the element-wise addition on the device.
     cudaError_t status = addWithCuda(sums, lhs, rhs, kCount);
     if (cudaSuccess != status) {
         fprintf(stderr, "addWithCuda failed!");
         return 1;
     }

     printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
         sums[0], sums[1], sums[2], sums[3], sums[4]);

     // Reset the device before exiting so that profiling and tracing tools
     // such as Nsight and Visual Profiler can show complete traces.
     status = cudaDeviceReset();
     if (cudaSuccess != status) {
         fprintf(stderr, "cudaDeviceReset failed!");
         return 1;
     }

     return 0;
 }
   
 // Helper function for using CUDA to add vectors in parallel.
 //
 // Computes c[i] = a[i] + b[i] for every i in [0, size) on GPU 0.
 //
 // Parameters:
 //   c    - host output buffer that receives `size` ints.
 //   a, b - host input buffers, each providing `size` ints.
 //   size - number of elements to add. 0 is accepted and is a no-op.
 //
 // Returns cudaSuccess on success, otherwise the status of the first CUDA
 // call that failed (a diagnostic is also written to stderr). Every device
 // buffer allocated here is freed before returning, on success and on error.
 cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
 {
     int *dev_a = 0;
     int *dev_b = 0;
     int *dev_c = 0;
     int maxThreadsPerBlock = 0;
     unsigned int offset = 0;
     unsigned int chunk = 0;
     cudaError_t cudaStatus;

     // Choose which GPU to run on, change this on a multi-GPU system.
     cudaStatus = cudaSetDevice(0);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
         goto Error;
     }

     // An empty vector needs no device work. Returning here also avoids a
     // launch with zero threads, which is an invalid launch configuration.
     if (size == 0) {
         return cudaSuccess;
     }

     // addKernel is launched as a single block, so one launch can cover at
     // most this many elements.
     cudaStatus = cudaDeviceGetAttribute(&maxThreadsPerBlock, cudaDevAttrMaxThreadsPerBlock, 0);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaDeviceGetAttribute failed: %s\n", cudaGetErrorString(cudaStatus));
         goto Error;
     }

     // Allocate GPU buffers for three vectors (two input, one output).
     cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMalloc failed!");
         goto Error;
     }

     cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMalloc failed!");
         goto Error;
     }

     cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMalloc failed!");
         goto Error;
     }

     // Copy input vectors from host memory to GPU buffers.
     cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMemcpy failed!");
         goto Error;
     }

     cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMemcpy failed!");
         goto Error;
     }

     // Launch the kernel with one thread for each element. The kernel indexes
     // by threadIdx.x only, so vectors longer than the per-block thread limit
     // are processed as consecutive single-block launches over offset
     // pointers. When size fits in one block this is exactly one launch.
     // NOTE(review): a multi-block kernel would be faster for large vectors,
     // but that requires changing addKernel's signature.
     for (offset = 0; offset < size; offset += chunk) {
         chunk = size - offset;
         if (chunk > (unsigned int)maxThreadsPerBlock) {
             chunk = (unsigned int)maxThreadsPerBlock;
         }

         addKernel<<<1, chunk>>>(dev_c + offset, dev_a + offset, dev_b + offset);

         // Check for any errors launching the kernel. A zero-thread launch
         // is rejected here too, so the loop cannot spin without progress.
         cudaStatus = cudaGetLastError();
         if (cudaStatus != cudaSuccess) {
             fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
             goto Error;
         }
     }

     // cudaDeviceSynchronize waits for the kernel to finish, and returns
     // any errors encountered during the launch.
     cudaStatus = cudaDeviceSynchronize();
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
         goto Error;
     }

     // Copy output vector from GPU buffer to host memory.
     cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMemcpy failed!");
         goto Error;
     }

 Error:
     // Best-effort cleanup; pointers that were never allocated are still 0,
     // for which cudaFree performs no operation.
     cudaFree(dev_c);
     cudaFree(dev_a);
     cudaFree(dev_b);

     return cudaStatus;
 }