{"id":22,"date":"2016-05-31T22:11:15","date_gmt":"2016-05-31T14:11:15","guid":{"rendered":"http:\/\/himmel.tech\/wp\/?p=22"},"modified":"2017-02-25T16:50:47","modified_gmt":"2017-02-25T08:50:47","slug":"cuda%e5%88%9d%e6%8e%a2","status":"publish","type":"post","link":"https:\/\/himmel.tech\/wp\/?p=22","title":{"rendered":"CUDA\u521d\u63a2"},"content":{"rendered":"<p>\u4e45\u95fbCUDA\u5927\u540d\uff0c\u6070\u597d\u4eca\u5929\u8981\u5199\u6570\u5b57\u56fe\u50cf\u5904\u7406\u7684\u4f5c\u4e1a\uff0c\u4e0d\u59a8\u6765\u8bd5\u4e00\u8bd5\u3002<\/p>\n<p>\u9996\u5148\u7ed9\u51fa\u6211\u770b\u7684\u4e00\u7bc7\u53c2\u8003\u6587\u7ae0\uff1ahttp:\/\/www.mamicode.com\/info-detail-327339.html<\/p>\n<p>\u622a\u81f3\u76ee\u524d\uff0cCUDA\u7684SDK\u7248\u672c\u53f7\u4e3a7.5\uff08\u6b63\u5f0f\u7248\uff09\u548c8.0\uff08RC\uff09\u3002\u63a8\u8350\u4f7f\u75287.5\uff0c\u4f46\u6b63\u5f0f\u7248\u4e0d\u652f\u6301Visual Studio 2015\uff0c\u53ea\u597d\u7528\u4e868.0\u30028.0\u4e0b\u8f7d\u9700\u8981\u6ce8\u518c\uff0c\u800c\u4e14\u4e0b\u8f7d\u65f6\u4f1a\u51fa\u73b0\u95ee\u9898\u8fdb\u5ea6\u5361\u572860%\uff0c\u8c8c\u4f3c\u662f\u56e0\u4e3a\u94fe\u63a5\u6709\u751f\u5b58\u65f6\u95f4\uff0c\u800c\u6211\u7684\u7f51\u901f\u4e0d\u591f\u5feb\u2026\u2026\u540e\u6765\u53d1\u73b0\u5148wget\u5230\u670d\u52a1\u5668\u4e0a\u518d\u4ece\u670d\u52a1\u5668\u4e0b\u8f7d\u5230\u672c\u5730\u5c31\u53ef\u4ee5\u4e86\u3002<br \/>\n<!--more--><\/p>\n<p>OK\uff0c\u5b89\u88c5\u8fc7\u7a0b\u5f88\u50bb\u74dc\u5316\u7684\uff0c\u4e00\u8defnext\u6ce8\u610f\u4e0b\u8b66\u544a\u5c31\u597d\u3002\u5b89\u88c5\u5b8c\u6bd5\u540e\u8fdbVisual Studio\u65b0\u5efa\u9879\u76ee\uff0c\u53d1\u73b0\u591a\u51fa\u6765\u4e86CUDA\u8fd9\u4e00\u9879\u3002\u5c31\u662f\u8fd9\u4e2a\uff1a<\/p>\n<p><img loading=\"lazy\" decoding=\"async\" class=\"alignnone\" src=\"http:\/\/himmel.tech\/wp\/wp-content\/uploads\/2016\/05\/1f4ba022a702c39eba27834c9abc966e.png\" width=\"708\" height=\"491\" \/><\/p>\n<p>\u76f4\u63a5\u786e\u5b9a\u5c31\u597d\u4e86\uff0c\u6ca1\u4ec0\u4e48\u591a\u4f59\u7684\u8bbe\u7f6e\u3002\u5efa\u7acb\u65b0\u5de5\u7a0b\u540e\uff0c\u81ea\u5e26\u4e86\u4e00\u4e2a\u4f8b\u7a0b\uff0c\u628a\u4e24\u4e2a\u5411\u91cf\u76f8\u52a0\u6c42\u548c\u7684\u5c0f\u7a0b\u5e8f\u3002\u76f4\u63a5\u7f16\u8bd1\u8fd0\u884c\uff0c\u82e5\u80fd\u591f\u6b63\u5e38\u8fd0\u884c\uff0c\u5219\u8bf4\u660e\u73af\u5883\u6ca1\u6709\u95ee\u9898\u4e86\u3002<\/p>\n<p><img decoding=\"async\" src=\"http:\/\/himmel.tech\/wp\/wp-content\/uploads\/2016\/05\/995a929ef5fb3725aba24035bc842e75.png\" \/><\/p>\n<p>\u987a\u4fbf\u4e00\u63d0\uff0c\u5728\u5b89\u88c5\u597d\u4e86CUDA Toolkit\u540e\uff0cVS\u7684\u83dc\u5355\u680f\u4e0a\u5e94\u8be5\u4f1a\u591a\u51fa\u6765\u4e00\u4e2aNsight\uff0c\u5b83\u662f\u7528\u6765\u8c03\u8bd5\u5185\u6838\u7a0b\u5e8f\u7528\u7684\uff0c\u91cc\u9762\u6709\u4e00\u4e2aSystem Info\uff0c\u53ef\u4ee5\u770b\u5230\u672c\u673a\u7684\u4e00\u4e9b\u5c5e\u6027\u3002\u671f\u95f4\u4f1a\u63d0\u793a\u8fde\u63a5\u4e0d\u5b89\u5168\uff0c\u5ffd\u7565\u5373\u53ef\u3002\u76f4\u63a5\u6253\u5230CUDA Devices\u9875\u9762<\/p>\n<p><img decoding=\"async\" src=\"http:\/\/himmel.tech\/wp\/wp-content\/uploads\/2016\/05\/058c0064a60b0e81497e20fa2e086814.png\" 
Now let's look at that bundled sample program; for convenience I'll just paste the whole thing here.

[code language="cpp"]
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);

__global__ void addKernel(int *c, const int *a, const int *b)
{
    int i = threadIdx.x;
    c[i] = a[i] + b[i];
}

int main()
{
    const int arraySize = 5;
    const int a[arraySize] = { 1, 2, 3, 4, 5 };
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    int c[arraySize] = { 0 };

    // Add vectors in parallel.
    cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
        c[0], c[1], c[2], c[3], c[4]);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, size>>>(dev_c, dev_a, dev_b);

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}
[/code]

All right. This is the basic flow of any CUDA computation: initialize the device -> allocate device memory -> copy the data in -> launch the kernel -> wait for the computation to finish -> copy the result out -> free the memory.
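Boiled down to its skeleton (this is just the sample above condensed by me, with every error check stripped out), the flow looks like this:

[code language="cpp"]
// Condensed sketch of the sample's flow; all error checking omitted.
int *dev_a, *dev_b, *dev_c;
cudaSetDevice(0);                                                  // 1. initialize the device
cudaMalloc((void**)&dev_a, size * sizeof(int));                    // 2. allocate device memory
cudaMalloc((void**)&dev_b, size * sizeof(int));
cudaMalloc((void**)&dev_c, size * sizeof(int));
cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);  // 3. copy inputs in
cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
addKernel<<<1, size>>>(dev_c, dev_a, dev_b);                       // 4. launch the kernel
cudaDeviceSynchronize();                                           // 5. wait for it to finish
cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);  // 6. copy the result out
cudaFree(dev_a); cudaFree(dev_b); cudaFree(dev_c);                 // 7. free device memory
[/code]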
Let's look at main first (the top half of the listing above). Clearly, it doesn't do much work: it just initializes the three vectors, hands everything off to addWithCuda, and receives any error information through the return value. So let's move on to the addWithCuda function (the bottom half of the same listing).
Well, it even uses goto, but that shouldn't be a big problem. Here is what the function does:

First, cudaSetDevice selects which device will run the computation.

cudaMalloc then allocates device memory. (Note that the returned pointer refers to a cell of GPU memory and is only meaningful inside kernel code; it is invalid in the host part of the program.)

cudaMemcpy transfers the data.

After that comes the kernel call. Note that invoking a kernel differs slightly from calling a host function; it looks like this:

addKernel<<<1, size>>>(dev_c, dev_a, dev_b);

There is an extra <<<Block_Num, Thread_Num>>>. These two parameters describe how the work should be distributed across the GPU's cores; I'll explain them below.

Then there is a cudaDeviceSynchronize, which waits for the kernel to finish executing.

Finally there is another cudaMemcpy, plus cudaFree, to copy the result back and release the device memory.

OK, that is the most, most, most basic CUDA workflow.

Last of all, let's look at the kernel function itself:

[code language="cpp"]
__global__ void addKernel(int *c, const int *a, const int *b)
{
    int i = threadIdx.x;
    c[i] = a[i] + b[i];
}
[/code]
This function is pretty simple: it takes three arrays as input. But wait: when this runs in parallel, how does a thread know its own identity? How does it know which element it is supposed to add? No problem: the built-in variable threadIdx stores the identity of the current thread. Here we let the i-th thread operate on the i-th element of the vectors, and that is what makes the processing parallel.

Remember MAX_THREADS_PER_BLOCK and MAX_BLOCK_DIM_X/Y/Z from earlier? The names say it all: MAX_THREADS_PER_BLOCK is the maximum number of threads in a single block, and although you can use dim3 to lay the threads out in up to three dimensions (each axis capped by MAX_BLOCK_DIM_X/Y/Z), the total can never exceed that limit. If you are only crunching something on your own machine, just look the numbers up and hard-code them; if you want the program to run on other machines too, you had better call cudaGetDeviceProperties to obtain the current device's information, as in the query sketch near the top of this post.

What if I need more threads than that limit?

No problem: use the block parameter in front of it, and inside the kernel use blockIdx to obtain your identity (a concrete indexing sketch follows below the figure). Unlike the thread limit, the limit on the number of blocks is per dimension; it is stored in MAX_GRID_DIM_X/Y/Z (not to be confused with MAX_BLOCK_DIM_X/Y/Z, which caps the thread layout inside one block). ~~That said, different blocks are apparently scheduled onto different multiprocessors: they cannot share memory and cannot synchronize with one another, although atomic operations still work.~~

The organization of blocks and threads looks like this:

[Image: grid/block/thread organization: http://img.blog.csdn.net/20130723220559500?watermark/2/text/aHR0cDovL2Jsb2cuY3Nkbi5uZXQva2trNTg0NTIw/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70/gravity/Center]
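To make the blockIdx/threadIdx combination concrete, here is the usual flattening pattern (my sketch, not from the sample project; addKernelLarge is a hypothetical name): combine the block and thread indices into one global index and guard against the padding in the last block.

[code language="cpp"]
// For a 1-D problem with n elements, launch ceil(n / 1024.0) blocks of
// 1024 threads each (1024 assumed to be within MAX_THREADS_PER_BLOCK).
__global__ void addKernelLarge(int *c, const int *a, const int *b, int n)
{
    // blockDim holds the number of threads per block used at launch time.
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)          // the last block may be only partially filled
        c[i] = a[i] + b[i];
}

// Host-side launch:
//   addKernelLarge<<<(n + 1023) / 1024, 1024>>>(dev_c, dev_a, dev_b, n);
[/code]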
All right, now for something practical. The program below implements a mean (box) filter, with each thread handling one pixel; loading and displaying the image is left to OpenCV. Pay attention to how blocks and threads are divided up, and to how out-of-bounds pixels are handled. Also, to save myself some effort I did no error handling whatsoever, so if you take this as-is and it blue-screens on you, don't come looking for me...

[code language="cpp"]
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <opencv2\opencv.hpp>

using namespace cv;

__global__ void cudaBlur(uchar *src, uchar *dst, long long start, int rows, int cols, int r = 1) {
    // Note: the start parameter is unused now. Originally the kernel was
    // launched in a for loop, one block of 1024 threads at a time, so it had
    // to be told where each launch began; blockIdx provides that directly.
    start = (long long)blockIdx.y * 1024 * 1024 + blockIdx.x * 1024;
    if (start + threadIdx.x >= rows * cols) return;  // past the last pixel
    int x = (start + threadIdx.x) / cols;            // row of this pixel
    int y = (start + threadIdx.x) % cols;            // column of this pixel
    int sums = 0;
    int count = 0;
    // Average the (2r+1) x (2r+1) window, clipped to the image borders.
    for (int i = x - r; i <= x + r; ++i) {
        if (i < 0) continue;
        if (i >= rows) break;
        for (int j = y - r; j <= y + r; ++j) {
            if (j < 0) continue;
            if (j >= cols) break;
            ++count;
            sums += src[i * cols + j];
        }
    }
    dst[x * cols + y] = sums / count;
}

Mat RAW;
Mat result;
uchar* devRAW = nullptr;
uchar* devResult = nullptr;

// Trackbar callback: rerun the filter with the new radius.
void fn(int, void*) {
    int window = getTrackbarPos("r", "result");
    Mutex mu;   // (a function-local mutex like this doesn't actually guard anything)
    mu.lock();
    // Too many blocks for one grid dimension, so a 2-D layout is needed:
    // each grid row covers 1024 blocks * 1024 threads = 1M pixels.
    cudaBlur<<<dim3(1024, (long long)RAW.rows * RAW.cols / 1024 / 1024 + 1, 1), 1024>>>(
        devRAW, devResult, 0, RAW.rows, RAW.cols, window);
    cudaDeviceSynchronize();
    for (int i = 0; i < RAW.rows; ++i) {
        cudaMemcpy(result.ptr(i), devResult + i * RAW.cols, RAW.cols, cudaMemcpyDeviceToHost);
    }
    mu.unlock();
    imshow("result", result);
}

int main()
{
    RAW = imread("Hosho.jpg", CV_LOAD_IMAGE_GRAYSCALE);
    cudaSetDevice(0);
    cudaMalloc<uchar>(&devRAW, RAW.rows * RAW.cols);
    cudaMalloc<uchar>(&devResult, RAW.rows * RAW.cols);
    // Upload the image to the device row by row.
    for (int i = 0; i < RAW.rows; ++i) {
        cudaMemcpy(devRAW + i * RAW.cols, RAW.ptr(i), RAW.cols, cudaMemcpyHostToDevice);
    }
    result = Mat(RAW.rows, RAW.cols, CV_8UC1);
    imshow("result", RAW);
    int a = 0;
    createTrackbar("r", "result", &a, 40, fn);
    waitKey(0);
    blur(RAW, result, Size(3, 3));  // OpenCV's CPU box filter, for comparison
    cudaFree(devRAW);
    cudaFree(devResult);
    return 0;
}
[/code]
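One small aside on those row-by-row copies: the device buffer is a single contiguous block anyway, so if the Mat is stored contiguously (which Mat::isContinuous reports, and which is typical for an image freshly loaded by imread), the whole upload could, I believe, be done in one transfer. An untested sketch of that simplification:

[code language="cpp"]
// Possible simplification (my sketch): one transfer for a continuous CV_8UC1 Mat.
if (RAW.isContinuous()) {
    cudaMemcpy(devRAW, RAW.data, RAW.rows * RAW.cols, cudaMemcpyHostToDevice);
} else {
    for (int i = 0; i < RAW.rows; ++i)   // fall back to row-by-row copies
        cudaMemcpy(devRAW + i * RAW.cols, RAW.ptr(i), RAW.cols, cudaMemcpyHostToDevice);
}
[/code]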
OK, that's all there is to it. I now have a first, very superficial understanding of CUDA xD