diff --git a/README.md b/README.md index d63a6a1..05382cc 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,200 @@ **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 1 - Flocking** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +![Boids Cover Image](images/Boids%20Cover.PNG) -### (TODO: Your README) +* Megan Reddy + * [LinkedIn](https://www.linkedin.com/in/meganr25a949125/), [personal website](https://meganr28.github.io/) +* Tested on: Windows 10, AMD Ryzen 9 5900HS with Radeon Graphics @ 3301 MHz 16GB, NVIDIA GeForce RTX 3060 Laptop GPU 6GB (Personal Computer) +* Compute Capability: 8.6 -Include screenshots, analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +### Overview + +| Coherent - 5000 boids | Coherent - 50000 boids | +:-------------------------:|:-------------------------: +![Boids 5,000 GIF](images/Boids%205k.gif) | ![Boids 50,000 GIF](images/Boids%2050k.gif) + +[Boids](https://en.wikipedia.org/wiki/Boids) is an artificial life simulation developed by Craig Reynolds in 1986. The simulation represents the flocking behavior of birds or fish (boids). + +This behavior follows three rules: +1. Cohesion - boids move towards the perceived center of mass of their neighbors +2. Separation - boids avoid getting too close to their neighbors +3. Alignment - boids generally try to move with the same direction and speed as +their neighbors + +These three rules determine the **velocity change** for a particular boid at a certain timestep. Thus, the contribution from each rule must be computed and added to the boid's current velocity. +For the Naive case, each boid must be checked against every other boid in the simulation. 
+ +Here is pseudocode for computing the velocity contribution from each of the three rules: + +#### Rule 1: Boids try to fly towards the centre of mass of neighbouring boids + +``` +function rule1(Boid boid) + + Vector perceived_center + + foreach Boid b: + if b != boid and distance(b, boid) < rule1Distance then + perceived_center += b.position + endif + end + + perceived_center /= number_of_neighbors + + return (perceived_center - boid.position) * rule1Scale +end +``` + +#### Rule 2: Boids try to keep a small distance away from other objects (including other boids). + +``` +function rule2(Boid boid) + + Vector c = 0 + + foreach Boid b + if b != boid and distance(b, boid) < rule2Distance then + c -= (b.position - boid.position) + endif + end + + return c * rule2Scale +end +``` + +#### Rule 3: Boids try to match velocity with near boids. + +``` +function rule3(Boid boid) + + Vector perceived_velocity + + foreach Boid b + if b != boid and distance(b, boid) < rule3Distance then + perceived_velocity += b.velocity + endif + end + + perceived_velocity /= number_of_neighbors + + return perceived_velocity * rule3Scale +end +``` + +#### Compute the boid's new velocity and position + +``` +function computeNewVelocity(Boid boid) + + v1 = rule1(boid); + v2 = rule2(boid); + v3 = rule3(boid); + + boid.velocity += v1 + v2 + v3; + boid.position += boid.velocity; +end +``` + +#### Optimizations + +The method presented in the previous section can be inefficient, especially with a large number of boids and a small neighborhood distance. +One way to avoid checking every other boid is to "bin" each particle into a uniform grid. This way, we can limit the search to a certain number of surrounding grid cells. 
+ +In 3D: + +* If `cell width` is 2x the neighborhood distance, you have to check 8 cells +* If `cell width` is 1x the neighborhood distance, you have to check 27 cells + +2D Visualization: + +![2D Uniform Grid Neighbor Search](images/Boids%20Ugrid%20neighbor%20search%20shown.png) + +##### Scattered Uniform Grid + +Calculating the uniform grid consists of these steps: + +1. Assign each boid a `gridCellIndex`. +2. Sort the list of boids by `gridCellIndex` so that all boids in a cell are contiguous in memory. +3. In separate arrays, store the `start` and `end` indices of a particular grid cell. +4. Depending on the position of the boid within its cell, loop through possible neighbors and compare against boids in those cells. + +The reason this version is called the "scattered" uniform grid is because the actual position and velocity data is not arranged contiguously in memory. +This can be fixed using the method in the following section. + +##### Coherent Uniform Grid + +To make the position and velocity data coherent, we rearrange the data to match the order of the shuffled boid indices. +This allows us to have one less memory access in the neighbor search loop since the position and velocity data can be indexed +directly with `gridCellStartIndices` and `gridCellEndIndices`. + +### Performance Analysis + +To analyze the performance of each implementation (`Naive`, `Scattered Uniform Grid`, `Coherent Uniform Grid`), the frame rate (fps) was measured over 50 seconds. +These numbers were averaged to obtain the **average frame rate** for each data point on the following graphs. Additionally, each graph indicates whether +visualization was turned on (meaning boids were rendered on the screen) or off (just the simulation). + +The following data was recorded in **Release Mode** with **Vertical Sync** turned off. + +#### Number of Boids vs. FPS (No visual) + +![Number of Boids vs. FPS (No Vis)](images/Boids%20FPS%20No%20Vis.png) + +#### Number of Boids vs. 
FPS (Visual) + +![Number of Boids vs. FPS (Vis)](images/Boids%20FPS%20Vis.png) + +Overall, changing the number of boids decreased the performance of each implementation. +The Scattered and Coherent Grids performed faster overall despite the performance decrease +due to limiting neighbor checks. + +#### Block Size vs. FPS (No visual) + +![Block Size vs. FPS (No Vis)](images/Block%20Size%20FPS%20No%20Vis.png) + +#### Block Size vs. FPS (Visual) + +![Block Size vs. FPS (Vis)](images/Block%20Size%20FPS%20Vis.png) + +The block size graphs were measured using a fixed boid number of **50,000**. Block size did not have an obvious impact on performance as the numbers stayed relatively the +same within each implementation. A possible reason for this is discussed in the section below. + +### Questions + +**1. For each implementation, how does changing the number of boids affect +performance? Why do you think this is?** \ +As the number of boids increases, the average FPS/performance per implementation goes down. +This is most likely caused by the higher boid density per cell, making it necessary to check +against more boids as the number increases. Even in optimized implementations such as the +Scattered and Coherent Grids, the number of boids has a significant impact on performance. +However, since the search space is reduced by checking fewer cells, the average fps is higher for those implementations. +It is worth noting that the Naive implementation still achieves a good performance at lower boid counts (5000, 10000), +but sharply decreases afterwards. + +**2. For each implementation, how does changing the block count and block size +affect performance? Why do you think this is?** \ +A GPU's main goal is to hide latency by switching between threads. Similarly, the GPU will keep the +hardware busy by executing other warps (groups of 32 threads) when others are stalled. 
The number of active warps +relative to the maximum possible number of active warps at a time is known as occupancy. I chose multiples of 32 for +my block sizes in order to maximize occupancy and increase performance, though this is not always the case as seen in the graphs. +There is a slight performance increase until about 256-512 threads are reached for some implementations. Afterwards, the gains +from additional occupancy are negligible. As a whole, increasing block size did not have much of an impact on performance for the +chosen block sizes. If I had chosen block sizes that were not multiples of 32, I might +have seen a decrease in performance due to unused threads and lower occupancy. + +**3. For the coherent uniform grid: did you experience any performance improvements +with the more coherent uniform grid? Was this the outcome you expected? +Why or why not?** \ +Yes, I did see a performance improvement with the coherent uniform grid. I expected this since there were +two memory reads that were removed, thus the performance should have been better. +Reading and writing from memory is slow on the GPU, therefore reducing these accesses +is desirable. Additionally, placing the position and velocity in contiguous memory reduces the number +of random accesses that occur. + +**4. Did changing cell width and checking 27 vs 8 neighboring cells affect performance? +Why or why not? Be careful: it is insufficient (and possibly incorrect) to say +that 27-cell is slower simply because there are more cells to check!** \ +Yes, I noticed a speed increase when running with 27 cells vs. running with 8 cells. This was especially noticeable +when running simulations with more than 10000 boids. When `cellWidth` is greater than `searchRadius`, this means that you have +to search a larger portion of the grid, which will be slower (as is the case with searching 8 neighboring cells). 
When `cellWidth` +is less than or equal `searchRadius`, the grid resolution is finer, therefore the search space is more confined (as is the case with 27 cells). diff --git a/images/Block Size FPS No Vis.png b/images/Block Size FPS No Vis.png new file mode 100644 index 0000000..e0698b9 Binary files /dev/null and b/images/Block Size FPS No Vis.png differ diff --git a/images/Block Size FPS Vis.png b/images/Block Size FPS Vis.png new file mode 100644 index 0000000..a912531 Binary files /dev/null and b/images/Block Size FPS Vis.png differ diff --git a/images/Boids 50k.gif b/images/Boids 50k.gif new file mode 100644 index 0000000..c05bdf2 Binary files /dev/null and b/images/Boids 50k.gif differ diff --git a/images/Boids 5k.gif b/images/Boids 5k.gif new file mode 100644 index 0000000..3e1c266 Binary files /dev/null and b/images/Boids 5k.gif differ diff --git a/images/Boids Cover.PNG b/images/Boids Cover.PNG new file mode 100644 index 0000000..a630e73 Binary files /dev/null and b/images/Boids Cover.PNG differ diff --git a/images/Boids FPS No Vis.png b/images/Boids FPS No Vis.png new file mode 100644 index 0000000..926a17e Binary files /dev/null and b/images/Boids FPS No Vis.png differ diff --git a/images/Boids FPS Vis.png b/images/Boids FPS Vis.png new file mode 100644 index 0000000..40f6ee9 Binary files /dev/null and b/images/Boids FPS Vis.png differ diff --git a/src/kernel.cu b/src/kernel.cu index 74dffcb..2294b52 100644 --- a/src/kernel.cu +++ b/src/kernel.cu @@ -86,6 +86,10 @@ int *dev_gridCellEndIndices; // to this cell? // TODO-2.3 - consider what additional buffers you might need to reshuffle // the position and velocity data to be coherent within cells. +glm::vec3* dev_rearrangedPosValues; +glm::vec3* dev_rearrangedVel1Values; +glm::vec3* dev_rearrangedVel2Values; + // LOOK-2.1 - Grid parameters based on simulation parameters. 
// These are automatically computed for you in Boids::initSimulation int gridCellCount; @@ -169,6 +173,27 @@ void Boids::initSimulation(int N) { gridMinimum.z -= halfGridWidth; // TODO-2.1 TODO-2.3 - Allocate additional buffers here. + cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!"); + + cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!"); + + cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!"); + + cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!"); + + cudaMalloc((void**)&dev_rearrangedPosValues, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_rearrangedPosValues failed!"); + + cudaMalloc((void**)&dev_rearrangedVel1Values, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_rearrangedVel1Values failed!"); + + cudaMalloc((void**)&dev_rearrangedVel2Values, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_rearrangedVel2Values failed!"); + cudaDeviceSynchronize(); } @@ -230,10 +255,61 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) * in the `pos` and `vel` arrays. 
*/ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) { - // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves - // Rule 2: boids try to stay a distance d away from each other - // Rule 3: boids try to match the speed of surrounding boids - return glm::vec3(0.0f, 0.0f, 0.0f); + glm::vec3 ruleOnePerceivedCenter(0.0f, 0.0f, 0.0f); // cohesion + glm::vec3 ruleTwoDistance(0.0f, 0.0f, 0.0f); // separation + glm::vec3 ruleThreePerceivedVelocity(0.0f, 0.0f, 0.0f); // alignment + + glm::vec3 ruleOneVelocity(0.0, 0.0, 0.0); + glm::vec3 ruleTwoVelocity(0.0, 0.0, 0.0); + glm::vec3 ruleThreeVelocity(0.0, 0.0, 0.0); + + int ruleOneNeighborCount = 0; + int ruleThreeNeighborCount = 0; + + glm::vec3 boidPos = pos[iSelf]; + glm::vec3 boidVel = vel[iSelf]; + + for (int i = 0; i < N; ++i) + { + if (i == iSelf) continue; + float distance = glm::length(boidPos - pos[i]); + + // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves + if (distance < rule1Distance) + { + ruleOnePerceivedCenter += pos[i]; + ++ruleOneNeighborCount; + } + + // Rule 2: boids try to stay a distance d away from each other + if (distance < rule2Distance) + { + ruleTwoDistance -= pos[i] - boidPos; + } + + // Rule 3: boids try to match the speed of surrounding boids + if (distance < rule3Distance) + { + ruleThreePerceivedVelocity += vel[i]; + ++ruleThreeNeighborCount; + } + } + + if (ruleOneNeighborCount > 0) + { + ruleOnePerceivedCenter /= ruleOneNeighborCount; + ruleOneVelocity = (ruleOnePerceivedCenter - boidPos) * rule1Scale; + } + if (ruleThreeNeighborCount > 0) + { + ruleThreePerceivedVelocity /= ruleThreeNeighborCount; + ruleThreeVelocity = ruleThreePerceivedVelocity * rule3Scale; + } + ruleTwoVelocity = ruleTwoDistance * rule2Scale; + + glm::vec3 velocity_change = boidVel + ruleOneVelocity + ruleTwoVelocity + ruleThreeVelocity; + + return velocity_change; } /** @@ -242,9 +318,20 
@@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *po */ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } // Compute a new velocity based on pos and vel1 + glm::vec3 new_velocity = computeVelocityChange(N, index, pos, vel1); // Clamp the speed + float speed = glm::length(new_velocity); + if (speed > maxSpeed) + { + new_velocity = (new_velocity / speed) * maxSpeed; + } // Record the new velocity into vel2. Question: why NOT vel1? + vel2[index] = new_velocity; } /** @@ -285,10 +372,20 @@ __device__ int gridIndex3Dto1D(int x, int y, int z, int gridResolution) { __global__ void kernComputeIndices(int N, int gridResolution, glm::vec3 gridMin, float inverseCellWidth, glm::vec3 *pos, int *indices, int *gridIndices) { + int boidIndex = threadIdx.x + (blockIdx.x * blockDim.x); + if (boidIndex >= N) { + return; + } // TODO-2.1 // - Label each boid with the index of its grid cell. + glm::vec3 boidPos = pos[boidIndex]; + int iX = glm::floor((boidPos.x - gridMin.x) * inverseCellWidth); + int iY = glm::floor((boidPos.y - gridMin.y) * inverseCellWidth); + int iZ = glm::floor((boidPos.z - gridMin.z) * inverseCellWidth); + gridIndices[boidIndex] = gridIndex3Dto1D(iX, iY, iZ, gridResolution); // - Set up a parallel array of integer indices as pointers to the actual // boid data in pos and vel1/vel2 + indices[boidIndex] = boidIndex; } // LOOK-2.1 Consider how this could be useful for indicating that a cell @@ -302,10 +399,34 @@ __global__ void kernResetIntBuffer(int N, int *intBuffer, int value) { __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices, int *gridCellStartIndices, int *gridCellEndIndices) { - // TODO-2.1 - // Identify the start point of each cell in the gridIndices array. 
- // This is basically a parallel unrolling of a loop that goes - // "this index doesn't match the one before it, must be a new cell!" + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + // TODO-2.1 + // Identify the start point of each cell in the gridIndices array. + // This is basically a parallel unrolling of a loop that goes + // "this index doesn't match the one before it, must be a new cell!" + int val0 = particleGridIndices[index]; + int val1 = val0; + if ((index + 1) < N) + { + val1 = particleGridIndices[index + 1]; + } + else + { + gridCellEndIndices[val1] = index; + } + + if (index == 0) { + gridCellStartIndices[val0] = index; + } + + if (val0 != val1) + { + gridCellEndIndices[val0] = index; + gridCellStartIndices[val1] = index + 1; + } } __global__ void kernUpdateVelNeighborSearchScattered( @@ -314,14 +435,153 @@ __global__ void kernUpdateVelNeighborSearchScattered( int *gridCellStartIndices, int *gridCellEndIndices, int *particleArrayIndices, glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) { - // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce - // the number of boids that need to be checked. - // - Identify the grid cell that this particle is in - // - Identify which cells may contain neighbors. This isn't always 8. - // - For each cell, read the start/end indices in the boid pointer array. - // - Access each boid in the cell and compute velocity change from - // the boids rules, if this boid is within the neighborhood distance. - // - Clamp the speed change before putting the new speed in vel2 + // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce + // the number of boids that need to be checked. 
+ int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + int boidIndex = particleArrayIndices[index]; + glm::vec3 boidPos = pos[boidIndex]; + glm::vec3 boidVel = vel1[boidIndex]; + + // - Identify the grid cell that this particle is in + // iX, iY, iZ represent the grid cell's 3D index + // fractX, fractY, fractZ help determine where in the cell the boid is located + int iX = glm::floor((boidPos.x - gridMin.x) * inverseCellWidth); + int iY = glm::floor((boidPos.y - gridMin.y) * inverseCellWidth); + int iZ = glm::floor((boidPos.z - gridMin.z) * inverseCellWidth); + float fractX = glm::fract((boidPos.x - gridMin.x) * inverseCellWidth); + float fractY = glm::fract((boidPos.y - gridMin.y) * inverseCellWidth); + float fractZ = glm::fract((boidPos.z - gridMin.z) * inverseCellWidth); + + // Determine which directions to check + int signX = 1; + int signY = 1; + int signZ = 1; + if (fractX < 0.5) signX = -1; + if (fractY < 0.5) signY = -1; + if (fractZ < 0.5) signZ = -1; + + // Initialize variables for calculating velocity change rules + glm::vec3 ruleOnePerceivedCenter(0.0f, 0.0f, 0.0f); // cohesion + glm::vec3 ruleTwoDistance(0.0f, 0.0f, 0.0f); // separation + glm::vec3 ruleThreePerceivedVelocity(0.0f, 0.0f, 0.0f); // alignment + + glm::vec3 ruleOneVelocity(0.0, 0.0, 0.0); + glm::vec3 ruleTwoVelocity(0.0, 0.0, 0.0); + glm::vec3 ruleThreeVelocity(0.0, 0.0, 0.0); + + int ruleOneNeighborCount = 0; + int ruleThreeNeighborCount = 0; + int cellCount = gridResolution * gridResolution * gridResolution; + + // - Identify which cells may contain neighbors. This isn't always 8. 
+ int zstart = imin(iZ, iZ + signZ); + int zend = imax(iZ, iZ + signZ); + int ystart = imin(iY, iY + signY); + int yend = imax(iY, iY + signY); + int xstart = imin(iX, iX + signX); + int xend = imax(iX, iX + signX); + + // Check at most 8 cells surrounding the current cell + for (int z = zstart; z <= zend; ++z) + { + for (int y = ystart; y <= yend; ++y) + { + for (int x = xstart; x <= xend; ++x) + { + int gridCellNeighborIndex = gridIndex3Dto1D(x, y, z, gridResolution); + if (gridCellNeighborIndex < 0 || gridCellNeighborIndex >= cellCount) continue; + + // - Access each boid in the cell and compute velocity change from + // the boids rules, if this boid is within the neighborhood distance. + // - For each cell, read the start/end indices in the boid pointer array. + int start = gridCellStartIndices[gridCellNeighborIndex]; + int end = gridCellEndIndices[gridCellNeighborIndex]; + if (start < 0 || end < 0) continue; + + for (int j = start; j <= end; ++j) + { + int b = particleArrayIndices[j]; + if (b == boidIndex) continue; + float distance = glm::length(boidPos - pos[b]); + + // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves + if (distance < rule1Distance) + { + ruleOnePerceivedCenter += pos[b]; + ++ruleOneNeighborCount; + } + + // Rule 2: boids try to stay a distance d away from each other + if (distance < rule2Distance) + { + ruleTwoDistance -= pos[b] - boidPos; + } + + // Rule 3: boids try to match the speed of surrounding boids + if (distance < rule3Distance) + { + ruleThreePerceivedVelocity += vel1[b]; + ++ruleThreeNeighborCount; + } + } + } + } + } + + if (ruleOneNeighborCount > 0) + { + ruleOnePerceivedCenter /= ruleOneNeighborCount; + ruleOneVelocity = (ruleOnePerceivedCenter - boidPos) * rule1Scale; + } + if (ruleThreeNeighborCount > 0) + { + ruleThreePerceivedVelocity /= ruleThreeNeighborCount; + ruleThreeVelocity = ruleThreePerceivedVelocity * rule3Scale; + } + ruleTwoVelocity = ruleTwoDistance * rule2Scale; + + 
glm::vec3 new_velocity = boidVel + ruleOneVelocity + ruleTwoVelocity + ruleThreeVelocity; + + // - Clamp the speed change before putting the new speed in vel2 + float speed = glm::length(new_velocity); + if (speed > maxSpeed) + { + new_velocity = (new_velocity / speed) * maxSpeed; + } + vel2[boidIndex] = new_velocity; +} + +__global__ void kernRearrangePosVelBuffers( + int N, int* particleArrayIndices, + glm::vec3* rearrangedPosVals, + glm::vec3* rearrangedVel1Vals, + glm::vec3* pos, glm::vec3* vel1, glm::vec3* vel2) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + int boidIndex = particleArrayIndices[index]; + rearrangedPosVals[index] = pos[boidIndex]; + rearrangedVel1Vals[index] = vel1[boidIndex]; +} + +__global__ void kernWriteBackBuffers( + int N, int* particleArrayIndices, + glm::vec3* rearrangedPosVals, + glm::vec3* rearrangedVel1Vals, + glm::vec3* rearrangedVel2Vals, + glm::vec3* pos, glm::vec3* vel1, glm::vec3* vel2) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + int boidIndex = particleArrayIndices[index]; + pos[boidIndex] = rearrangedPosVals[index]; + vel1[boidIndex] = rearrangedVel1Vals[index]; + vel2[boidIndex] = rearrangedVel2Vals[index]; } __global__ void kernUpdateVelNeighborSearchCoherent( @@ -329,18 +589,125 @@ __global__ void kernUpdateVelNeighborSearchCoherent( float inverseCellWidth, float cellWidth, int *gridCellStartIndices, int *gridCellEndIndices, glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) { - // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered, - // except with one less level of indirection. - // This should expect gridCellStartIndices and gridCellEndIndices to refer - // directly to pos and vel1. - // - Identify the grid cell that this particle is in - // - Identify which cells may contain neighbors. This isn't always 8. - // - For each cell, read the start/end indices in the boid pointer array. 
- // DIFFERENCE: For best results, consider what order the cells should be - // checked in to maximize the memory benefits of reordering the boids data. - // - Access each boid in the cell and compute velocity change from - // the boids rules, if this boid is within the neighborhood distance. - // - Clamp the speed change before putting the new speed in vel2 + // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered, + // except with one less level of indirection. + // This should expect gridCellStartIndices and gridCellEndIndices to refer + // directly to pos and vel1. + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + glm::vec3 boidPos = pos[index]; + glm::vec3 boidVel = vel1[index]; + + // - Identify the grid cell that this particle is in + // iX, iY, iZ represent the grid cell's 3D index + // fractX, fractY, fractZ help determine where in the cell the boid is located + int iX = glm::floor((boidPos.x - gridMin.x) * inverseCellWidth); + int iY = glm::floor((boidPos.y - gridMin.y) * inverseCellWidth); + int iZ = glm::floor((boidPos.z - gridMin.z) * inverseCellWidth); + float fractX = glm::fract((boidPos.x - gridMin.x) * inverseCellWidth); + float fractY = glm::fract((boidPos.y - gridMin.y) * inverseCellWidth); + float fractZ = glm::fract((boidPos.z - gridMin.z) * inverseCellWidth); + + // Determine which directions to check + int signX = 1; + int signY = 1; + int signZ = 1; + if (fractX < 0.5) signX = -1; + if (fractY < 0.5) signY = -1; + if (fractZ < 0.5) signZ = -1; + + // Initialize variables for calculating velocity change rules + glm::vec3 ruleOnePerceivedCenter(0.0f, 0.0f, 0.0f); // cohesion + glm::vec3 ruleTwoDistance(0.0f, 0.0f, 0.0f); // separation + glm::vec3 ruleThreePerceivedVelocity(0.0f, 0.0f, 0.0f); // alignment + + glm::vec3 ruleOneVelocity(0.0, 0.0, 0.0); + glm::vec3 ruleTwoVelocity(0.0, 0.0, 0.0); + glm::vec3 ruleThreeVelocity(0.0, 0.0, 0.0); + + int ruleOneNeighborCount = 0; + int 
ruleThreeNeighborCount = 0; + int cellCount = gridResolution * gridResolution * gridResolution; + + // - Identify which cells may contain neighbors. This isn't always 8. + int zstart = imin(iZ, iZ + signZ); + int zend = imax(iZ, iZ + signZ); + int ystart = imin(iY, iY + signY); + int yend = imax(iY, iY + signY); + int xstart = imin(iX, iX + signX); + int xend = imax(iX, iX + signX); + + // Check at most 8 cells surrounding the current cell + for (int z = zstart; z <= zend; ++z) + { + for (int y = ystart; y <= yend; ++y) + { + for (int x = xstart; x <= xend; ++x) + { + int gridCellNeighborIndex = gridIndex3Dto1D(x, y, z, gridResolution); + if (gridCellNeighborIndex < 0 || gridCellNeighborIndex >= cellCount) continue; + + // - For each cell, read the start/end indices in the boid pointer array. + // DIFFERENCE: For best results, consider what order the cells should be + // checked in to maximize the memory benefits of reordering the boids data. + int start = gridCellStartIndices[gridCellNeighborIndex]; + int end = gridCellEndIndices[gridCellNeighborIndex]; + if (start < 0 || end < 0) continue; + + // - Access each boid in the cell and compute velocity change from + // the boids rules, if this boid is within the neighborhood distance. 
+ for (int j = start; j <= end; ++j) + { + if (j == index) continue; + float distance = glm::length(boidPos - pos[j]); + + // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves + if (distance < rule1Distance) + { + ruleOnePerceivedCenter += pos[j]; + ++ruleOneNeighborCount; + } + + // Rule 2: boids try to stay a distance d away from each other + if (distance < rule2Distance) + { + ruleTwoDistance -= pos[j] - boidPos; + } + + // Rule 3: boids try to match the speed of surrounding boids + if (distance < rule3Distance) + { + ruleThreePerceivedVelocity += vel1[j]; + ++ruleThreeNeighborCount; + } + } + } + } + } + + if (ruleOneNeighborCount > 0) + { + ruleOnePerceivedCenter /= ruleOneNeighborCount; + ruleOneVelocity = (ruleOnePerceivedCenter - boidPos) * rule1Scale; + } + if (ruleThreeNeighborCount > 0) + { + ruleThreePerceivedVelocity /= ruleThreeNeighborCount; + ruleThreeVelocity = ruleThreePerceivedVelocity * rule3Scale; + } + ruleTwoVelocity = ruleTwoDistance * rule2Scale; + + glm::vec3 new_velocity = boidVel + ruleOneVelocity + ruleTwoVelocity + ruleThreeVelocity; + + // - Clamp the speed change before putting the new speed in vel2 + float speed = glm::length(new_velocity); + if (speed > maxSpeed) + { + new_velocity = (new_velocity / speed) * maxSpeed; + } + vel2[index] = new_velocity; } /** @@ -348,7 +715,11 @@ __global__ void kernUpdateVelNeighborSearchCoherent( */ void Boids::stepSimulationNaive(float dt) { // TODO-1.2 - use the kernels you wrote to step the simulation forward in time. 
+ dim3 blocksPerGrid((numObjects + blockSize - 1) / blockSize); + kernUpdateVelocityBruteForce << > > (numObjects, dev_pos, dev_vel1, dev_vel2); + kernUpdatePos << > > (numObjects, dt, dev_pos, dev_vel2); // TODO-1.2 ping-pong the velocity buffers + std::swap(dev_vel1, dev_vel2); } void Boids::stepSimulationScatteredGrid(float dt) { @@ -357,13 +728,31 @@ void Boids::stepSimulationScatteredGrid(float dt) { // In Parallel: // - label each particle with its array index as well as its grid index. // Use 2x width grids. + dim3 blocksPerGrid((numObjects + blockSize - 1) / blockSize); + kernComputeIndices << > > (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices); + // - Unstable key sort using Thrust. A stable sort isn't necessary, but you // are welcome to do a performance comparison. + thrust::device_ptr dev_thrustArrayIndices(dev_particleArrayIndices); + thrust::device_ptr dev_thrustGridIndices(dev_particleGridIndices); + thrust::sort_by_key(dev_thrustGridIndices, dev_thrustGridIndices + numObjects, dev_thrustArrayIndices); + // - Naively unroll the loop for finding the start and end indices of each // cell's data pointers in the array of boid indices + // Initialize start and end index arrays to -1 (signifying that cells do not enclose any boids) + dim3 blocksPerCellGrid((gridCellCount + blockSize - 1) / blockSize); + kernResetIntBuffer << > > (gridCellCount, dev_gridCellStartIndices, -1); + kernResetIntBuffer << > > (gridCellCount, dev_gridCellEndIndices, -1); + kernIdentifyCellStartEnd << > > (numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices); + // - Perform velocity updates using neighbor search + kernUpdateVelNeighborSearchScattered << > > (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2); + // - Update positions + 
kernUpdatePos << > > (numObjects, dt, dev_pos, dev_vel2); + // - Ping-pong buffers as needed + std::swap(dev_vel1, dev_vel2); } void Boids::stepSimulationCoherentGrid(float dt) { @@ -372,16 +761,36 @@ void Boids::stepSimulationCoherentGrid(float dt) { // In Parallel: // - Label each particle with its array index as well as its grid index. // Use 2x width grids + dim3 blocksPerGrid((numObjects + blockSize - 1) / blockSize); + kernComputeIndices << > > (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices); + // - Unstable key sort using Thrust. A stable sort isn't necessary, but you // are welcome to do a performance comparison. + thrust::device_ptr dev_thrustArrayIndices(dev_particleArrayIndices); + thrust::device_ptr dev_thrustGridIndices(dev_particleGridIndices); + thrust::sort_by_key(dev_thrustGridIndices, dev_thrustGridIndices + numObjects, dev_thrustArrayIndices); + // - Naively unroll the loop for finding the start and end indices of each // cell's data pointers in the array of boid indices + dim3 blocksPerCellGrid((gridCellCount + blockSize - 1) / blockSize); + kernResetIntBuffer << > > (gridCellCount, dev_gridCellStartIndices, -1); + kernResetIntBuffer << > > (gridCellCount, dev_gridCellEndIndices, -1); + kernIdentifyCellStartEnd << > > (numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices); + // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all // the particle data in the simulation array. 
// CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED + kernRearrangePosVelBuffers << > > (numObjects, dev_particleArrayIndices, dev_rearrangedPosValues, dev_rearrangedVel1Values, dev_pos, dev_vel1, dev_vel2); + // - Perform velocity updates using neighbor search + kernUpdateVelNeighborSearchCoherent << > > (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, dev_rearrangedPosValues, dev_rearrangedVel1Values, dev_rearrangedVel2Values); + // - Update positions + kernWriteBackBuffers<<>>(numObjects, dev_particleArrayIndices, dev_rearrangedPosValues, dev_rearrangedVel1Values, dev_rearrangedVel2Values, dev_pos, dev_vel1, dev_vel2); + kernUpdatePos << > > (numObjects, dt, dev_pos, dev_vel2); + // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE. + std::swap(dev_vel1, dev_vel2); } void Boids::endSimulation() { @@ -390,6 +799,14 @@ void Boids::endSimulation() { cudaFree(dev_pos); // TODO-2.1 TODO-2.3 - Free any additional buffers here. + cudaFree(dev_particleArrayIndices); + cudaFree(dev_particleGridIndices); + cudaFree(dev_gridCellStartIndices); + cudaFree(dev_gridCellEndIndices); + + cudaFree(dev_rearrangedPosValues); + cudaFree(dev_rearrangedVel1Values); + cudaFree(dev_rearrangedVel2Values); } void Boids::unitTest() { diff --git a/src/main.cpp b/src/main.cpp index b82c8c6..df1624b 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -220,6 +220,11 @@ void initShaders(GLuint * program) { Boids::unitTest(); // LOOK-1.2 We run some basic example code to make sure // your CUDA development setup is ready to go. 
+ // For performance analysis + double fpsTotal = 0; + int frameSteps = 0; + double startTime = glfwGetTime(); + while (!glfwWindowShouldClose(window)) { glfwPollEvents(); @@ -241,6 +246,11 @@ void initShaders(GLuint * program) { ss << " fps] " << deviceName; glfwSetWindowTitle(window, ss.str().c_str()); + if (time - startTime < 50.0) { + fpsTotal += fps; + ++frameSteps; + } + glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); #if VISUALIZE @@ -256,6 +266,7 @@ void initShaders(GLuint * program) { glfwSwapBuffers(window); #endif } + std::cout << "Average FPS: " << fpsTotal / frameSteps << std::endl; glfwDestroyWindow(window); glfwTerminate(); }