hdiff output

r32176/lj_potential.cu 2017-03-23 17:30:12.079611749 +0000 r32175/lj_potential.cu 2017-03-23 17:30:13.547631093 +0000
151:                 }151:                 }
152: 152: 
153:         __inline__ __device__153:         __inline__ __device__
154:                 double blockReduceSum(double val) {154:                 double blockReduceSum(double val) {
155:                         static __shared__ double shared[32]; // Shared mem for 32 partial sums155:                         static __shared__ double shared[32]; // Shared mem for 32 partial sums
156:                         int lane = threadIdx.x % warpSize;156:                         int lane = threadIdx.x % warpSize;
157:                         int wid = threadIdx.x / warpSize;157:                         int wid = threadIdx.x / warpSize;
158: 158: 
159:                         val = warpReduceSum(val);     // Each warp performs partial reduction159:                         val = warpReduceSum(val);     // Each warp performs partial reduction
160: 160: 
161:                         // Write reduced value to shared memory161:                         if (lane==0) shared[wid]=val; // Write reduced value to shared memory
162:                         if (lane==0) { 
163:                                 shared[wid]=val; 
164:                         } 
165: 162: 
166:                         __syncthreads();              // Wait for all partial reductions163:                         __syncthreads();              // Wait for all partial reductions
167: 164: 
168:                         //read from shared memory only if that warp existed165:                         //read from shared memory only if that warp existed
169:                         val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;166:                         val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
170: 167: 
171:                         //Final reduce within first warp168:                         if (wid==0) val = warpReduceSum(val); //Final reduce within first warp
172:                         if (wid==0) { 
173:                                 val = warpReduceSum(val); 
174:                         } 
175: 169: 
176:                         return val;170:                         return val;
177:                 }171:                 }
178: 172: 
179:         __global__ void deviceReduceKernel(const double *in, double* out, const int N) {173:         __global__ void deviceReduceKernel(const double *in, double* out, const int N) {
180:                 double sum = 0;174:                 double sum = 0;
181:                 //reduce multiple elements per thread.175:                 //reduce multiple elements per thread.
182:                 for (int i = blockIdx.x * blockDim.x + threadIdx.x;176:                 for (int i = blockIdx.x * blockDim.x + threadIdx.x;
183:                                 i < N;177:                                 i < N;
184:                                 i += blockDim.x * gridDim.x) {178:                                 i += blockDim.x * gridDim.x) {
204:                         int index = ((tid + roundedMaxSite) % roundedMaxSite);198:                         int index = ((tid + roundedMaxSite) % roundedMaxSite);
205:                         int thisAtom = tid/roundedMaxSite;199:                         int thisAtom = tid/roundedMaxSite;
206: 200: 
207:                         if (index < N) {201:                         if (index < N) {
208:                                 int var = index + thisAtom*N;202:                                 int var = index + thisAtom*N;
209:                                 elements.x = in[3*var+0];203:                                 elements.x = in[3*var+0];
210:                                 elements.y = in[3*var+1];204:                                 elements.y = in[3*var+1];
211:                                 elements.z = in[3*var+2];205:                                 elements.z = in[3*var+2];
212:                         }206:                         }
213: 207: 
214:                         elements.x = blockReduceSum(elements.x); 
215:                         __syncthreads();208:                         __syncthreads();
 209: 
 210:                         elements.x = blockReduceSum(elements.x);
216:                         elements.y = blockReduceSum(elements.y);211:                         elements.y = blockReduceSum(elements.y);
217:                         __syncthreads(); 
218:                         elements.z = blockReduceSum(elements.z);212:                         elements.z = blockReduceSum(elements.z);
219: 213: 
 214:                         __syncthreads();
 215: 
220:                         if (threadIdx.x==0) {216:                         if (threadIdx.x==0) {
221:                                 if (blocks == numDimensions/3) {217:                                 if (blocks == numDimensions/3) {
222:                                         d_gradf[3*thisAtom+0] = elements.x;218:                                         d_gradf[3*thisAtom+0] = elements.x;
223:                                         d_gradf[3*thisAtom+1] = elements.y;219:                                         d_gradf[3*thisAtom+1] = elements.y;
224:                                         d_gradf[3*thisAtom+2] = elements.z;220:                                         d_gradf[3*thisAtom+2] = elements.z;
225:                                 }221:                                 }
226:                                 else {222:                                 else {
227:                                         out[3*blockIdx.x+0] = elements.x;223:                                         out[3*blockIdx.x+0] = elements.x;
228:                                         out[3*blockIdx.x+1] = elements.y;224:                                         out[3*blockIdx.x+1] = elements.y;
229:                                         out[3*blockIdx.x+2] = elements.z;225:                                         out[3*blockIdx.x+2] = elements.z;


r32176/rigid_bodies.cu 2017-03-23 17:30:11.795608010 +0000 r32175/rigid_bodies.cu 2017-03-23 17:30:13.275627502 +0000
1360:                         int thisBody = tid/roundedMaxSite;1360:                         int thisBody = tid/roundedMaxSite;
1361:                         int currentBody = m_d_largeRigidIndices[thisBody];1361:                         int currentBody = m_d_largeRigidIndices[thisBody];
1362: 1362: 
1363:                         if (index < m_d_nRigidSitesPerBody[currentBody]) {1363:                         if (index < m_d_nRigidSitesPerBody[currentBody]) {
1364:                                 int myAtom = m_d_rigidGroups[index+(*m_d_rigidMaxSite)*currentBody];1364:                                 int myAtom = m_d_rigidGroups[index+(*m_d_rigidMaxSite)*currentBody];
1365:                                 elements.x = d_gk[3*myAtom-3];1365:                                 elements.x = d_gk[3*myAtom-3];
1366:                                 elements.y = d_gk[3*myAtom-2];1366:                                 elements.y = d_gk[3*myAtom-2];
1367:                                 elements.z = d_gk[3*myAtom-1];1367:                                 elements.z = d_gk[3*myAtom-1];
1368:                         }1368:                         }
1369: 1369: 
1370:                         elements.x = blockReduceSum(elements.x); 
1371:                         __syncthreads();1370:                         __syncthreads();
 1371: 
 1372:                         elements.x = blockReduceSum(elements.x);
1372:                         elements.y = blockReduceSum(elements.y);1373:                         elements.y = blockReduceSum(elements.y);
1373:                         __syncthreads(); 
1374:                         elements.z = blockReduceSum(elements.z);1374:                         elements.z = blockReduceSum(elements.z);
1375: 1375: 
 1376:                         __syncthreads();
 1377: 
1376:                         if (threadIdx.x==0) {1378:                         if (threadIdx.x==0) {
1377:                                 d_outArray[3*blockIdx.x+0] = elements.x;1379:                                 d_outArray[3*blockIdx.x+0] = elements.x;
1378:                                 d_outArray[3*blockIdx.x+1] = elements.y;1380:                                 d_outArray[3*blockIdx.x+1] = elements.y;
1379:                                 d_outArray[3*blockIdx.x+2] = elements.z;1381:                                 d_outArray[3*blockIdx.x+2] = elements.z;
1380:                         }1382:                         }
1381: 1383: 
1382:                         tid += blockDim.x * gridDim.x;1384:                         tid += blockDim.x * gridDim.x;
1383:                 }1385:                 }
1384:         }1386:         }
1385: 1387: 
1399:                         int index = ((tid + roundedMaxSite) % roundedMaxSite);1401:                         int index = ((tid + roundedMaxSite) % roundedMaxSite);
1400:                         int thisBody = tid/roundedMaxSite;1402:                         int thisBody = tid/roundedMaxSite;
1401:                         int currentBody = m_d_largeRigidIndices[thisBody];1403:                         int currentBody = m_d_largeRigidIndices[thisBody];
1402: 1404: 
1403:                         if (index < m_d_nRigidSitesPerBody[currentBody]) {1405:                         if (index < m_d_nRigidSitesPerBody[currentBody]) {
1404:                                 elements.x = d_tempArray[0+3*index+3*(*m_d_rigidMaxSite)*currentBody];1406:                                 elements.x = d_tempArray[0+3*index+3*(*m_d_rigidMaxSite)*currentBody];
1405:                                 elements.y = d_tempArray[1+3*index+3*(*m_d_rigidMaxSite)*currentBody];1407:                                 elements.y = d_tempArray[1+3*index+3*(*m_d_rigidMaxSite)*currentBody];
1406:                                 elements.z = d_tempArray[2+3*index+3*(*m_d_rigidMaxSite)*currentBody];1408:                                 elements.z = d_tempArray[2+3*index+3*(*m_d_rigidMaxSite)*currentBody];
1407:                         }1409:                         }
1408: 1410: 
1409:                         elements.x = blockReduceSum(elements.x); 
1410:                         __syncthreads();1411:                         __syncthreads();
 1412: 
 1413:                         elements.x = blockReduceSum(elements.x);
1411:                         elements.y = blockReduceSum(elements.y);1414:                         elements.y = blockReduceSum(elements.y);
1412:                         __syncthreads(); 
1413:                         elements.z = blockReduceSum(elements.z);1415:                         elements.z = blockReduceSum(elements.z);
1414: 1416: 
 1417:                         __syncthreads();
 1418: 
1415:                         if (threadIdx.x==0) {1419:                         if (threadIdx.x==0) {
1416:                                 d_outArray[3*blockIdx.x+0] = elements.x;1420:                                 d_outArray[3*blockIdx.x+0] = elements.x;
1417:                                 d_outArray[3*blockIdx.x+1] = elements.y;1421:                                 d_outArray[3*blockIdx.x+1] = elements.y;
1418:                                 d_outArray[3*blockIdx.x+2] = elements.z;1422:                                 d_outArray[3*blockIdx.x+2] = elements.z;
1419:                         }1423:                         }
1420: 1424: 
1421:                         tid += blockDim.x * gridDim.x;1425:                         tid += blockDim.x * gridDim.x;
1422:                 }1426:                 }
1423:         }1427:         }
1424: 1428: 
1439:                         int currentBody = m_d_largeRigidIndices[thisBody];1443:                         int currentBody = m_d_largeRigidIndices[thisBody];
1440: 1444: 
1441:                         int sectionSize = outSize/(*m_d_nLargeRigidBody);1445:                         int sectionSize = outSize/(*m_d_nLargeRigidBody);
1442:                         if (index < sectionSize) {1446:                         if (index < sectionSize) {
1443:                                 int var = index+thisBody*sectionSize;1447:                                 int var = index+thisBody*sectionSize;
1444:                                 elements.x = d_outArray[3*var+0];1448:                                 elements.x = d_outArray[3*var+0];
1445:                                 elements.y = d_outArray[3*var+1];1449:                                 elements.y = d_outArray[3*var+1];
1446:                                 elements.z = d_outArray[3*var+2];1450:                                 elements.z = d_outArray[3*var+2];
1447:                         }1451:                         }
1448: 1452: 
1449:                         elements.x = blockReduceSum(elements.x); 
1450:                         __syncthreads();1453:                         __syncthreads();
 1454: 
 1455:                         elements.x = blockReduceSum(elements.x);
1451:                         elements.y = blockReduceSum(elements.y);1456:                         elements.y = blockReduceSum(elements.y);
1452:                         __syncthreads(); 
1453:                         elements.z = blockReduceSum(elements.z);1457:                         elements.z = blockReduceSum(elements.z);
1454: 1458: 
 1459:                         __syncthreads();
 1460: 
1455:                         if (threadIdx.x==0) {1461:                         if (threadIdx.x==0) {
1456:                                 if (blocks == (*m_d_nLargeRigidBody)) {1462:                                 if (blocks == (*m_d_nLargeRigidBody)) {
1457:                                         m_d_gkRigid[3*currentBody] = elements.x;1463:                                         m_d_gkRigid[3*currentBody] = elements.x;
1458:                                         m_d_gkRigid[3*currentBody+1] = elements.y;1464:                                         m_d_gkRigid[3*currentBody+1] = elements.y;
1459:                                         m_d_gkRigid[3*currentBody+2] = elements.z;1465:                                         m_d_gkRigid[3*currentBody+2] = elements.z;
1460:                                 }1466:                                 }
1461:                                 else {1467:                                 else {
1462:                                         d_outArray[3*blockIdx.x+0] = elements.x;1468:                                         d_outArray[3*blockIdx.x+0] = elements.x;
1463:                                         d_outArray[3*blockIdx.x+1] = elements.y;1469:                                         d_outArray[3*blockIdx.x+1] = elements.y;
1464:                                         d_outArray[3*blockIdx.x+2] = elements.z;1470:                                         d_outArray[3*blockIdx.x+2] = elements.z;
1486:                         int currentBody = m_d_largeRigidIndices[thisBody];1492:                         int currentBody = m_d_largeRigidIndices[thisBody];
1487: 1493: 
1488:                         int sectionSize = outSize/(*m_d_nLargeRigidBody);1494:                         int sectionSize = outSize/(*m_d_nLargeRigidBody);
1489:                         if (index < sectionSize) {1495:                         if (index < sectionSize) {
1490:                                 int var = index+thisBody*sectionSize;1496:                                 int var = index+thisBody*sectionSize;
1491:                                 elements.x = d_outArray[3*var+0];1497:                                 elements.x = d_outArray[3*var+0];
1492:                                 elements.y = d_outArray[3*var+1];1498:                                 elements.y = d_outArray[3*var+1];
1493:                                 elements.z = d_outArray[3*var+2];1499:                                 elements.z = d_outArray[3*var+2];
1494:                         }1500:                         }
1495: 1501: 
1496:                         elements.x = blockReduceSum(elements.x); 
1497:                         __syncthreads();1502:                         __syncthreads();
 1503: 
 1504:                         elements.x = blockReduceSum(elements.x);
1498:                         elements.y = blockReduceSum(elements.y);1505:                         elements.y = blockReduceSum(elements.y);
1499:                         __syncthreads(); 
1500:                         elements.z = blockReduceSum(elements.z);1506:                         elements.z = blockReduceSum(elements.z);
1501: 1507: 
 1508:                         __syncthreads();
 1509: 
1502:                         if (threadIdx.x==0) {1510:                         if (threadIdx.x==0) {
1503:                                 if (blocks == (*m_d_nLargeRigidBody)) {1511:                                 if (blocks == (*m_d_nLargeRigidBody)) {
1504:                                         m_d_gkRigid[3*(*m_d_nRigidBody)+3*currentBody] = elements.x;1512:                                         m_d_gkRigid[3*(*m_d_nRigidBody)+3*currentBody] = elements.x;
1505:                                         m_d_gkRigid[3*(*m_d_nRigidBody)+3*currentBody+1] = elements.y;1513:                                         m_d_gkRigid[3*(*m_d_nRigidBody)+3*currentBody+1] = elements.y;
1506:                                         m_d_gkRigid[3*(*m_d_nRigidBody)+3*currentBody+2] = elements.z;1514:                                         m_d_gkRigid[3*(*m_d_nRigidBody)+3*currentBody+2] = elements.z;
1507:                                 }1515:                                 }
1508:                                 else {1516:                                 else {
1509:                                         d_outArray[3*blockIdx.x+0] = elements.x;1517:                                         d_outArray[3*blockIdx.x+0] = elements.x;
1510:                                         d_outArray[3*blockIdx.x+1] = elements.y;1518:                                         d_outArray[3*blockIdx.x+1] = elements.y;
1511:                                         d_outArray[3*blockIdx.x+2] = elements.z;1519:                                         d_outArray[3*blockIdx.x+2] = elements.z;
1534:                         int currentBody = m_d_largeRigidIndices[thisBody];1542:                         int currentBody = m_d_largeRigidIndices[thisBody];
1535: 1543: 
1536:                         int sectionSize = outSize/(*m_d_nLargeRigidBody);1544:                         int sectionSize = outSize/(*m_d_nLargeRigidBody);
1537:                         if (index < sectionSize) {1545:                         if (index < sectionSize) {
1538:                                 int var = index+thisBody*sectionSize;1546:                                 int var = index+thisBody*sectionSize;
1539:                                 elements.x = d_outArray[3*var+0];1547:                                 elements.x = d_outArray[3*var+0];
1540:                                 elements.y = d_outArray[3*var+1];1548:                                 elements.y = d_outArray[3*var+1];
1541:                                 elements.z = d_outArray[3*var+2];1549:                                 elements.z = d_outArray[3*var+2];
1542:                         }1550:                         }
1543: 1551: 
1544:                         elements.x = blockReduceSum(elements.x); 
1545:                         __syncthreads();1552:                         __syncthreads();
 1553: 
 1554:                         elements.x = blockReduceSum(elements.x);
1546:                         elements.y = blockReduceSum(elements.y);1555:                         elements.y = blockReduceSum(elements.y);
1547:                         __syncthreads(); 
1548:                         elements.z = blockReduceSum(elements.z);1556:                         elements.z = blockReduceSum(elements.z);
1549: 1557: 
 1558:                         __syncthreads();
 1559: 
1550:                         if (threadIdx.x==0) {1560:                         if (threadIdx.x==0) {
1551:                                 if (blocks == (*m_d_nLargeRigidBody)) {1561:                                 if (blocks == (*m_d_nLargeRigidBody)) {
1552:                                         d_torques[3*currentBody] = elements.x;1562:                                         d_torques[3*currentBody] = elements.x;
1553:                                         d_torques[3*currentBody+1] = elements.y;1563:                                         d_torques[3*currentBody+1] = elements.y;
1554:                                         d_torques[3*currentBody+2] = elements.z;1564:                                         d_torques[3*currentBody+2] = elements.z;
1555:                                 }1565:                                 }
1556:                                 else {1566:                                 else {
1557:                                         d_outArray[3*blockIdx.x+0] = elements.x;1567:                                         d_outArray[3*blockIdx.x+0] = elements.x;
1558:                                         d_outArray[3*blockIdx.x+1] = elements.y;1568:                                         d_outArray[3*blockIdx.x+1] = elements.y;
1559:                                         d_outArray[3*blockIdx.x+2] = elements.z;1569:                                         d_outArray[3*blockIdx.x+2] = elements.z;


legend
Lines Added 
Lines changed
 Lines Removed

hdiff - version: 2.1.0