hdiff output

r33294/CMakeLists.txt 2017-09-08 11:30:14.973098277 +0100 r33293/CMakeLists.txt 2017-09-08 11:30:16.657120753 +0100
 12:  12: 
 13: # svn root directory should be one directory above this directory 13: # svn root directory should be one directory above this directory
 14: get_filename_component(SVN_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/.. ABSOLUTE) 14: get_filename_component(SVN_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/.. ABSOLUTE)
 15:  15: 
 16: # CUDA compilation 16: # CUDA compilation
 17: find_package(CUDA) 17: find_package(CUDA)
 18: # Set nvcc flags 18: # Set nvcc flags
 19: # These use optimisations and build for CUDA 3.5 virtual and real architectures 19: # These use optimisations and build for CUDA 3.5 virtual and real architectures
 20: # See http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#gpu-compilation  20: # See http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#gpu-compilation 
 21: # for more details. 21: # for more details.
 22: set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-O3;-arch=compute_35;-code=sm_35) 22: set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-O3;-arch=compute_35;-code=sm_35;-use_fast_math)
 23:  23: 
 24: # Make sure use_DPDP flag is set for AMBER 24: # Make sure use_DPDP flag is set for AMBER
 25: add_definitions(-Duse_DPDP) 25: add_definitions(-Duse_DPDP)
 26:  26: 
 27: # Set source and include directories 27: # Set source and include directories
 28: file(GLOB CUDA_SOURCES "${CUDAINTERFACE_SOURCE_DIR}/bfgsts/*.cu" "${CUDAINTERFACE_SOURCE_DIR}/cost_function/*.cu" "${CUDAINTERFACE_SOURCE_DIR}/lbfgs/*.cu" "${CUDAINTERFACE_SOURCE_DIR}/potential/*.cu" "${CUDAINTERFACE_SOURCE_DIR}/utils/*.cu") 28: file(GLOB CUDA_SOURCES "${CUDAINTERFACE_SOURCE_DIR}/bfgsts/*.cu" "${CUDAINTERFACE_SOURCE_DIR}/cost_function/*.cu" "${CUDAINTERFACE_SOURCE_DIR}/lbfgs/*.cu" "${CUDAINTERFACE_SOURCE_DIR}/potential/*.cu" "${CUDAINTERFACE_SOURCE_DIR}/utils/*.cu")
 29: set(INCLUDE_DIRS "${CUDAINTERFACE_SOURCE_DIR}/include" "${CUDAINTERFACE_SOURCE_DIR}/lbfgs" "${SVN_ROOT}/AMBER12/cuda") 29: set(INCLUDE_DIRS "${CUDAINTERFACE_SOURCE_DIR}/include" "${CUDAINTERFACE_SOURCE_DIR}/lbfgs" "${SVN_ROOT}/AMBER12/cuda")
 30:  30: 
 31: # Create library 31: # Create library
 32: cuda_include_directories(${INCLUDE_DIRS}) 32: cuda_include_directories(${INCLUDE_DIRS})


r33294/lbfgs.cu 2017-09-08 11:30:15.413104150 +0100 r33293/lbfgs.cu 2017-09-08 11:30:17.097126625 +0100
360:                 update3<<<1, 1>>>(m_d_rho + (m_cumulativeIter % m_updates), m_d_H0, d_tmp, d_tmp2);360:                 update3<<<1, 1>>>(m_d_rho + (m_cumulativeIter % m_updates), m_d_H0, d_tmp, d_tmp2);
361:                 CudaCheckError();361:                 CudaCheckError();
362:                 cudaDeviceSynchronize();362:                 cudaDeviceSynchronize();
363: 363: 
364:                 if (isTimingUpdates) {364:                 if (isTimingUpdates) {
365:                         m_timer_updates.stop();365:                         m_timer_updates.stop();
366:                 }366:                 }
367: 367: 
368:                 ++m_cumulativeIter;368:                 ++m_cumulativeIter;
369:         }369:         }
 370:         //        CudaSafeCall( cudaMemcpy(&printEne, d_fk, sizeof(double), cudaMemcpyDeviceToHost) );
370: 371: 
371:         *itDone = it;372:         *itDone = it;
372: 373: 
373:         if (isRayleighRitz) {374:         if (isRayleighRitz) {
374:                 // Normalise. 375:                 // Normalise. 
375:                 double thisFactor;376:                 double thisFactor;
376:                 m_cublas.dispatchNrm2(numDimensions, &thisFactor, d_x, false); // thisFactor = sqrt(x Dot x)377:                 m_cublas.dispatchNrm2(numDimensions, &thisFactor, d_x, false); // thisFactor = sqrt(x Dot x)
377:                 thisFactor = 1.0/thisFactor;378:                 thisFactor = 1.0/thisFactor;
378:                 m_cublas.dispatchScale(numDimensions, d_x, d_x, &thisFactor, false); // x = thisfactor*x379:                 m_cublas.dispatchScale(numDimensions, d_x, d_x, &thisFactor, false); // x = thisfactor*x
379:         }380:         }


r33294/lj_potential.cu 2017-09-08 11:30:15.633107087 +0100 r33293/lj_potential.cu 2017-09-08 11:30:17.313129508 +0100
240:                 // Each thread deals with one atom interacting with one other atom.240:                 // Each thread deals with one atom interacting with one other atom.
241: 241: 
242:                 int tid = blockIdx.x * blockDim.x + threadIdx.x;242:                 int tid = blockIdx.x * blockDim.x + threadIdx.x;
243: 243: 
244:                 while (tid < (numDimensions/3)*(numDimensions/3)) {244:                 while (tid < (numDimensions/3)*(numDimensions/3)) {
245: 245: 
246:                         int refAtom = tid / (numDimensions/3); // Integer division rounds down.246:                         int refAtom = tid / (numDimensions/3); // Integer division rounds down.
247:                         int myAtom = tid % (numDimensions/3);247:                         int myAtom = tid % (numDimensions/3);
248: 248: 
249:                         // Read the coordinates from global memory into memory local to each thread. 249:                         // Read the coordinates from global memory into memory local to each thread. 
250:                         // Could reorder d_x to xxxyyyzzz pattern for coalesced access (in setup routine before L-BFGS) 
251:                         // Extra reads probably covered by L2 cache on Kepler though - would need to test 
252:                         double myPositionX = d_x[3*myAtom+0];250:                         double myPositionX = d_x[3*myAtom+0];
253:                         double myPositionY = d_x[3*myAtom+1];251:                         double myPositionY = d_x[3*myAtom+1];
254:                         double myPositionZ = d_x[3*myAtom+2];252:                         double myPositionZ = d_x[3*myAtom+2];
255: 253: 
256:                         double refPositionX = d_x[3*refAtom+0];254:                         double refPositionX = d_x[3*refAtom+0];
257:                         double refPositionY = d_x[3*refAtom+1];255:                         double refPositionY = d_x[3*refAtom+1];
258:                         double refPositionZ = d_x[3*refAtom+2];256:                         double refPositionZ = d_x[3*refAtom+2];
259: 257: 
260:                         int mj1 = refAtom % nAddTarget;258:                         int mj1 = refAtom % nAddTarget;
261:                         int mj2 = myAtom % nAddTarget;259:                         int mj2 = myAtom % nAddTarget;


r33294/modcudalbfgs.F90 2017-09-08 11:30:15.845109916 +0100 r33293/modcudalbfgs.F90 2017-09-08 11:30:17.533132444 +0100
 85: INTERFACE 85: INTERFACE
 86:     SUBROUTINE CUDA_ENEGRAD_CPUTOGPU(NATOMS, COORDS, C_TOTENERGY, C_GRADIENTS, C_NADDTARGET, C_LJADDREP, C_LJADDATT, C_CUDAPOT, &  86:     SUBROUTINE CUDA_ENEGRAD_CPUTOGPU(NATOMS, COORDS, C_TOTENERGY, C_GRADIENTS, C_NADDTARGET, C_LJADDREP, C_LJADDATT, C_CUDAPOT, & 
 87:                                     C_CUDATIMET, POTENTIALTIME) BIND(C,NAME="setup_potential_cputogpu") 87:                                     C_CUDATIMET, POTENTIALTIME) BIND(C,NAME="setup_potential_cputogpu")
 88:  88: 
 89:         IMPORT :: C_INT, C_DOUBLE, C_BOOL, C_CHAR 89:         IMPORT :: C_INT, C_DOUBLE, C_BOOL, C_CHAR
 90:  90: 
 91:         INTEGER(KIND=C_INT), INTENT(IN) :: NATOMS, & ! No. of atoms 91:         INTEGER(KIND=C_INT), INTENT(IN) :: NATOMS, & ! No. of atoms
 92:                                            C_NADDTARGET ! Target cluster size (addressability) 92:                                            C_NADDTARGET ! Target cluster size (addressability)
 93:  93: 
 94:         REAL(KIND=C_DOUBLE), DIMENSION(3*NATOMS), INTENT(IN) :: COORDS ! Atomic coordinates 94:         REAL(KIND=C_DOUBLE), DIMENSION(3*NATOMS), INTENT(IN) :: COORDS ! Atomic coordinates
 95:         ! Need Fortran compiler support for Technical Specification 29113 to pass allocatable arrays across ISO_C_BINDING interface 
 96:         ! Currently only available for very newest compilers, but this should be done in future for LJADDREP/LJADATT so that large  
 97:         ! arrays reside on the heap rather than the stack 
 98:         REAL(KIND=C_DOUBLE), DIMENSION(C_NADDTARGET, C_NADDTARGET), INTENT(IN) ::C_LJADDREP, C_LJADDATT ! Repulsive/attractive epsilon matrix 95:         REAL(KIND=C_DOUBLE), DIMENSION(C_NADDTARGET, C_NADDTARGET), INTENT(IN) ::C_LJADDREP, C_LJADDATT ! Repulsive/attractive epsilon matrix
 99:         REAL(KIND=C_DOUBLE), INTENT(OUT) :: C_TOTENERGY, & ! Total energy of the system 96:         REAL(KIND=C_DOUBLE), INTENT(OUT) :: C_TOTENERGY, & ! Total energy of the system
100:                                             POTENTIALTIME ! Time taken in calculating potential - not used in GMIN 97:                                             POTENTIALTIME ! Time taken in calculating potential - not used in GMIN
101:         REAL(KIND=C_DOUBLE), DIMENSION(3*NATOMS), INTENT(OUT) :: C_GRADIENTS ! Gradient of the energy w.r.t. each atomic coordinate 98:         REAL(KIND=C_DOUBLE), DIMENSION(3*NATOMS), INTENT(OUT) :: C_GRADIENTS ! Gradient of the energy w.r.t. each atomic coordinate
102:  99: 
103:         LOGICAL(KIND=C_BOOL), INTENT(IN) :: C_CUDATIMET ! If true, print timing info100:         LOGICAL(KIND=C_BOOL), INTENT(IN) :: C_CUDATIMET ! If true, print timing info
104: 101: 
105:         CHARACTER(LEN=1, KIND=C_CHAR), INTENT(IN) :: C_CUDAPOT ! Character specifying the CUDA potential to be used102:         CHARACTER(LEN=1, KIND=C_CHAR), INTENT(IN) :: C_CUDAPOT ! Character specifying the CUDA potential to be used
106: 103: 
107:     END SUBROUTINE CUDA_ENEGRAD_CPUTOGPU    104:     END SUBROUTINE CUDA_ENEGRAD_CPUTOGPU    
349: 346: 
350:         LOGICAL(KIND=C_BOOL) :: C_CUDATIMET347:         LOGICAL(KIND=C_BOOL) :: C_CUDATIMET
351: 348: 
352:         CHARACTER(LEN=1, KIND=C_CHAR) :: C_CUDAPOT349:         CHARACTER(LEN=1, KIND=C_CHAR) :: C_CUDAPOT
353: 350: 
354:         DOUBLE PRECISION    :: HESSIAN(3*NATOMS, 3*NATOMS)351:         DOUBLE PRECISION    :: HESSIAN(3*NATOMS, 3*NATOMS)
355:         DOUBLE PRECISION    :: DELTA352:         DOUBLE PRECISION    :: DELTA
356:         DOUBLE PRECISION    :: GRAD_PLUS(3*NATOMS), GRAD_MINUS(3*NATOMS)353:         DOUBLE PRECISION    :: GRAD_PLUS(3*NATOMS), GRAD_MINUS(3*NATOMS)
357:         INTEGER             :: I, J354:         INTEGER             :: I, J
358: 355: 
359:         DOUBLE PRECISION  :: MYTSTART, MYTFINISH 
360:  
361: #ifndef DUMMY_CUDA356: #ifndef DUMMY_CUDA
362:         IF (ALLOCATED(LJADDREP) .AND. ALLOCATED(LJADDATT) .AND. LJADD3T) THEN357:         IF (ALLOCATED(LJADDREP) .AND. ALLOCATED(LJADDATT) .AND. LJADD3T) THEN
363:             DO J = 1,NADDTARGET358:             DO J = 1,NADDTARGET
364:                 DO I = 1,NADDTARGET359:                 DO I = 1,NADDTARGET
365:                     C_LJADDREP((J - 1)*NADDTARGET + I) = LJADDREP(I,J)360:                     C_LJADDREP((J - 1)*NADDTARGET + I) = LJADDREP(I,J)
366:                     C_LJADDATT((J - 1)*NADDTARGET + I) = LJADDATT(I,J)361:                     C_LJADDATT((J - 1)*NADDTARGET + I) = LJADDATT(I,J)
367:                 END DO362:                 END DO
368:             END DO363:             END DO
369:         ELSE364:         ELSE
370:             C_LJADDREP(:) = 1.0365:             C_LJADDREP(:) = 1.0
371:             C_LJADDATT(:) = 1.0366:             C_LJADDATT(:) = 1.0
372:         END IF367:         END IF
373: 368: 
374:         C_NADDTARGET = NADDTARGET369:         C_NADDTARGET = NADDTARGET
375:         C_CUDAPOT = CUDAPOT370:         C_CUDAPOT = CUDAPOT
376:         C_CUDATIMET = CUDATIMET371:         C_CUDATIMET = CUDATIMET
377: 372: 
378:         CALL CPU_TIME(MYTSTART) 
379:         DO I = 1, 3*NATOMS373:         DO I = 1, 3*NATOMS
380:             ! Plus374:             ! Plus
381:             COORDS(I) = COORDS(I) + DELTA375:             COORDS(I) = COORDS(I) + DELTA
382:             CALL CUDA_ENEGRAD_CPUTOGPU(3*NATOMS, COORDS, C_ENERGY, C_GRADIENTS, C_NADDTARGET, C_LJADDREP, C_LJADDATT, C_CUDAPOT, & 376:             CALL CUDA_ENEGRAD_CPUTOGPU(3*NATOMS, COORDS, C_ENERGY, C_GRADIENTS, C_NADDTARGET, C_LJADDREP, C_LJADDATT, C_CUDAPOT, & 
383:                                       C_CUDATIMET, POTENTIALTIME)377:                                       C_CUDATIMET, POTENTIALTIME)
384:             NPCALL = NPCALL + 1 
385:             GRAD_PLUS(:) = DBLE(C_GRADIENTS(:))378:             GRAD_PLUS(:) = DBLE(C_GRADIENTS(:))
386:             ! Minus379:             ! Minus
387:             COORDS(I) = COORDS(I) - 2.0D0 * DELTA380:             COORDS(I) = COORDS(I) - 2.0D0 * DELTA
388:             CALL CUDA_ENEGRAD_CPUTOGPU(3*NATOMS, COORDS, C_ENERGY, C_GRADIENTS, C_NADDTARGET, C_LJADDREP, C_LJADDATT, C_CUDAPOT, & 381:             CALL CUDA_ENEGRAD_CPUTOGPU(3*NATOMS, COORDS, C_ENERGY, C_GRADIENTS, C_NADDTARGET, C_LJADDREP, C_LJADDATT, C_CUDAPOT, & 
389:                                       C_CUDATIMET, POTENTIALTIME)382:                                       C_CUDATIMET, POTENTIALTIME)
390:             NPCALL = NPCALL + 1 
391:             GRAD_MINUS(:) = DBLE(C_GRADIENTS(:))383:             GRAD_MINUS(:) = DBLE(C_GRADIENTS(:))
392:             ! Reset coords384:             ! Reset coords
393:             COORDS(I) = COORDS(I) + DELTA385:             COORDS(I) = COORDS(I) + DELTA
394:             ! Calculate hessian386:             ! Calculate hessian
395:             HESSIAN(I, :) = (GRAD_PLUS(:) - GRAD_MINUS(:)) / (2.0D0 * DELTA)387:             HESSIAN(I, :) = (GRAD_PLUS(:) - GRAD_MINUS(:)) / (2.0D0 * DELTA)
396:         END DO388:         END DO
397:         CALL MYCPU_TIME(MYTFINISH,.FALSE.) 
398:         MYTFINISH=MYTFINISH-MYTSTART 
399:         WRITE(MYUNIT,'(A,F15.2)') ' Numerical hessian elapsed time = ',MYTFINISH 
400: #endif389: #endif
401:     END SUBROUTINE CUDA_NUMERICAL_HESS390:     END SUBROUTINE CUDA_NUMERICAL_HESS
402: 391: 
403: END MODULE MODCUDALBFGS392: END MODULE MODCUDALBFGS


r33294/potential.f 2017-09-08 11:30:16.441117870 +0100 r33293/potential.f 2017-09-08 11:30:17.977138369 +0100
3354:                CALL AMBER12_ENERGY_AND_GRADIENT(NATOMS,3354:                CALL AMBER12_ENERGY_AND_GRADIENT(NATOMS,
3355:      &                                          COORDS,3355:      &                                          COORDS,
3356:      &                                          ENERGY,3356:      &                                          ENERGY,
3357:      &                                          GRADATOMS,3357:      &                                          GRADATOMS,
3358:      &                                          ENERGY_DECOMP)3358:      &                                          ENERGY_DECOMP)
3359:             END IF3359:             END IF
3360:             VNEW(1:3*NATOMS) = GRADATOMS(:)3360:             VNEW(1:3*NATOMS) = GRADATOMS(:)
3361: ! Calculate the numerical hessian3361: ! Calculate the numerical hessian
3362:             IF (STEST) THEN3362:             IF (STEST) THEN
3363:                IF (.NOT. ALLOCATED(HESS)) ALLOCATE(HESS(3*NATOMS, 3*NATOMS))3363:                IF (.NOT. ALLOCATED(HESS)) ALLOCATE(HESS(3*NATOMS, 3*NATOMS))
3364:                CALL AMBER12_NUM_HESS(NATOMS, COORDS, DELTA=1.0D-4, HESSIAN=HESS(:, :))3364:                CALL AMBER12_NUM_HESS(NATOMS, COORDS, DELTA=1.0D-5, HESSIAN=HESS(:, :))
3365:             END IF3365:             END IF
3366:             IF (PTEST) THEN3366:             IF (PTEST) THEN
3367:                WRITE(*,10) ' potential> Energy for last cycle=',ENERGY,' kcal/mol'3367:                WRITE(*,10) ' potential> Energy for last cycle=',ENERGY,' kcal/mol'
3368:                WRITE(ESTRING,10) 'Energy for last cycle=',ENERGY,' kcal/mol'3368:                WRITE(ESTRING,10) 'Energy for last cycle=',ENERGY,' kcal/mol'
3369:             ENDIF3369:             ENDIF
3370:             IF (RIGIDINIT .AND. (.NOT. ATOMRIGIDCOORDT) ) THEN3370:             IF (RIGIDINIT .AND. (.NOT. ATOMRIGIDCOORDT) ) THEN
3371:                IF (STEST) THEN3371:                IF (STEST) THEN
3372:                   CALL TRANSFORMHESSIAN(HESS, GRADATOMS, XRIGIDCOORDS, XRIGIDHESS, RBAANORMALMODET)3372:                   CALL TRANSFORMHESSIAN(HESS, GRADATOMS, XRIGIDCOORDS, XRIGIDHESS, RBAANORMALMODET)
3373:                   HESS(DEGFREEDOMS+1:3*NATOMS,:) = 0.0D03373:                   HESS(DEGFREEDOMS+1:3*NATOMS,:) = 0.0D0
3374:                   HESS(:,DEGFREEDOMS+1:3*NATOMS) = 0.0D03374:                   HESS(:,DEGFREEDOMS+1:3*NATOMS) = 0.0D0


r33294/potential.f90 2017-09-08 11:30:16.165114187 +0100 r33293/potential.f90 2017-09-08 11:30:17.757135434 +0100
589:          GRAD(DEGFREEDOMS+1:3*NATOMS)=0.0D0589:          GRAD(DEGFREEDOMS+1:3*NATOMS)=0.0D0
590:       END IF590:       END IF
591: !copied from OPTIM, rbody part not tested591: !copied from OPTIM, rbody part not tested
592:       IF (SECT) THEN592:       IF (SECT) THEN
593:          IF (ALLOCATED(HESS)) DEALLOCATE(HESS)593:          IF (ALLOCATED(HESS)) DEALLOCATE(HESS)
594:          ALLOCATE(HESS(3*NATOMS, 3*NATOMS))594:          ALLOCATE(HESS(3*NATOMS, 3*NATOMS))
595:          IF (CUDAT) THEN595:          IF (CUDAT) THEN
596:             CALL CUDA_NUMERICAL_HESS(NATOMS, X, HESS, DELTA=1.0D-4)596:             CALL CUDA_NUMERICAL_HESS(NATOMS, X, HESS, DELTA=1.0D-4)
597:          ELSE597:          ELSE
598:             CALL AMBER12_NUM_HESS(NATOMS,X, DELTA=1.0D-4, HESSIAN=HESS(:, :))598:             CALL AMBER12_NUM_HESS(NATOMS,X, DELTA=1.0D-4, HESSIAN=HESS(:, :))
599:             NPCALL=NPCALL+4 
600:          END IF599:          END IF
601:          IF (RIGIDINIT .AND. (.NOT. ATOMRIGIDCOORDT) ) THEN600:          IF (RIGIDINIT .AND. (.NOT. ATOMRIGIDCOORDT) ) THEN
602:             CALL TRANSFORMHESSIAN(HESS, GRADATOMS, XRIGIDCOORDS,XRIGIDHESS, RBAANORMALMODET)601:             CALL TRANSFORMHESSIAN(HESS, GRADATOMS, XRIGIDCOORDS,XRIGIDHESS, RBAANORMALMODET)
603:             HESS(DEGFREEDOMS+1:3*NATOMS,:) = 0.0D0602:             HESS(DEGFREEDOMS+1:3*NATOMS,:) = 0.0D0
604:             HESS(:,DEGFREEDOMS+1:3*NATOMS) = 0.0D0603:             HESS(:,DEGFREEDOMS+1:3*NATOMS) = 0.0D0
605:             HESS(1:DEGFREEDOMS,1:DEGFREEDOMS) = XRIGIDHESS(1:DEGFREEDOMS,1:DEGFREEDOMS)604:             HESS(1:DEGFREEDOMS,1:DEGFREEDOMS) = XRIGIDHESS(1:DEGFREEDOMS,1:DEGFREEDOMS)
606:          END IF605:          END IF
607:       END IF606:       END IF
608: ! AMBER 9 Energy and gradient calls607: ! AMBER 9 Energy and gradient calls
609:    ELSE IF (AMBERT) THEN608:    ELSE IF (AMBERT) THEN


r33294/rigid_bodies.cu 2017-09-08 11:30:15.193101215 +0100 r33293/rigid_bodies.cu 2017-09-08 11:30:16.877123689 +0100
161:         CudaSafeCall( cudaMalloc(&d_grmi3, 9 * m_nRigidBody * sizeof(double)) );161:         CudaSafeCall( cudaMalloc(&d_grmi3, 9 * m_nRigidBody * sizeof(double)) );
162: 162: 
163:         bool shouldFindDerivatives = true;163:         bool shouldFindDerivatives = true;
164:         CudaSafeCall( cudaMemcpyToSymbol(gpu_rigid_bodies::shouldFindDeriv, &shouldFindDerivatives,  sizeof(bool)) );164:         CudaSafeCall( cudaMemcpyToSymbol(gpu_rigid_bodies::shouldFindDeriv, &shouldFindDerivatives,  sizeof(bool)) );
165: 165: 
166:         dim3 blockDim;166:         dim3 blockDim;
167:         blockDim.x = 256;167:         blockDim.x = 256;
168:         dim3 gridDim;168:         dim3 gridDim;
169:         gridDim.x = (m_nRigidBody + blockDim.x - 1)/blockDim.x;169:         gridDim.x = (m_nRigidBody + blockDim.x - 1)/blockDim.x;
170: 170: 
171:         // Calculation of derivates of rotation matrices - one thread per rigid body. 171:         // Calculation of rotation matrices - one thread per rigid body. 
172:         gpu_rigid_bodies::gradTransform1a<<<gridDim, blockDim>>>(m_d_xRigid, d_grmi1, d_grmi2, d_grmi3, m_d_nRigidBody);172:         gpu_rigid_bodies::gradTransform1a<<<gridDim, blockDim>>>(m_d_xRigid, d_grmi1, d_grmi2, d_grmi3, m_d_nRigidBody);
173:         CudaCheckError();173:         CudaCheckError();
174:         cudaDeviceSynchronize();174:         cudaDeviceSynchronize();
175: 175: 
176:         // Reduction - projection of atomistic forces onto translational degrees of freedom of each rigid body.176:         // Reduction - projection of atomistic forces onto translational degrees of freedom of each rigid body.
177:         blockDim.x = 1024;177:         blockDim.x = 1024;
178:         gridDim.x = (32*m_nRigidBody + blockDim.x - 1)/blockDim.x; // 32 is warpSize178:         gridDim.x = (32*m_nRigidBody + blockDim.x - 1)/blockDim.x; // 32 is warpSize
179:         // Reduction only takes place for rigid bodies with 32 or fewer sites. 179:         // Reduction only takes place for rigid bodies with 32 or fewer sites. 
180:         gpu_rigid_bodies::warpReduce1<<<gridDim, blockDim>>>(m_d_nRigidBody, m_d_nRigidSitesPerBody, m_d_rigidGroups, m_d_rigidMaxSite, 180:         gpu_rigid_bodies::warpReduce1<<<gridDim, blockDim>>>(m_d_nRigidBody, m_d_nRigidSitesPerBody, m_d_rigidGroups, m_d_rigidMaxSite, 
181:                         d_gk, m_d_gkRigid);181:                         d_gk, m_d_gkRigid);


legend
Lines Added 
Lines changed
 Lines Removed

hdiff - version: 2.1.0