首先声明,本人用的是Windows 7操作系统。使用Windows 8操作系统的小伙伴们会启动不了Nsight Monitor,原因在于Windows 8操作系统自带的.NET Framework版本过新。解决办法可以是:安装一个版本旧一点的Matlab,安装开始时会提示下载安装旧版本的.NET Framework;等.NET Framework安装完成后就可以停止安装Matlab了,此时Nsight Monitor也可以启动了。
下面回到正题,如何在VS中查看PTX代码呢?在此举个例子,按照这个步骤便可以查看到了。
本人使用的代码是CUDA自带的一个Sample:
1. 确定不使用Nsight安全连接,即下图最后一项为False。
2. 启动Nsight Monitor,选择“开始性能分析”。
3. 选择Profile CUDA Application,并且将Collect information for CUDA Source View勾选上。
4. Launch!
5. 切换界面后,左上角选择Source and PTX。
6. 在视图中看到左边是源代码,右边绿色区域是对应某一行源代码的PTX代码。
其实,Nsight Monitor还是有很强大的功能的,感兴趣的小伙伴们可以自己翻翻手册和资料~
下面回到正题,如何在VS中查看PTX代码呢?在此举个例子,按照这个步骤便可以查看到了。
本人使用的代码是CUDA自带的一个Sample:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);

// Element-wise vector addition on the device: c[i] = a[i] + b[i].
//
// The element index is taken from threadIdx.x alone and there is no bounds
// check, so the kernel is meant to be launched as a single block with exactly
// one thread per element (which is what addWithCuda does); with several
// blocks, every block would touch the same elements.
__global__ void addKernel(int *c, const int *a, const int *b)
{
    int i = threadIdx.x;
    c[i] = a[i] + b[i];
}

// Adds two fixed 5-element vectors on the GPU and prints the result.
// Returns 0 on success and 1 if the addition or the device reset fails.
int main()
{
    const int arraySize = 5;
    const int a[arraySize] = { 1, 2, 3, 4, 5 };
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    int c[arraySize] = { 0 };

    // Add vectors in parallel.
    cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
        c[0], c[1], c[2], c[3], c[4]);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
//
// Parameters:
//   c    - host output buffer receiving `size` ints
//   a, b - host input buffers, `size` ints each
//   size - number of elements; also used directly as the thread count of the
//          single block that is launched
//
// Returns cudaSuccess, or the status of the first CUDA call that failed (a
// message is written to stderr in that case). All device buffers are released
// on every path through the shared `Error` label.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    // Initialised to null so that the cudaFree calls at `Error` are harmless
    // when a failure happens before the corresponding allocation.
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void **)&dev_c, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void **)&dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void **)&dev_b, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, size>>>(dev_c, dev_a, dev_b);

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    // Single clean-up point reached on success and on every failure.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}
1. 确定不使用Nsight安全连接,即下图最后一项为False。

2. 启动Nsight Monitor,选择“开始性能分析”。

3. 选择Profile CUDA Application,并且将Collect information for CUDA Source View勾选上。

4. Launch!
5. 切换界面后,左上角选择Source and PTX。

6. 在视图中看到左边是源代码,右边绿色区域是对应某一行源代码的PTX代码。

其实,Nsight Monitor还是有很强大的功能的,感兴趣的小伙伴们可以自己翻翻手册和资料~