diff --git a/Part1/.ycm_extra_conf.py b/Part1/.ycm_extra_conf.py new file mode 100644 index 0000000..70537d7 --- /dev/null +++ b/Part1/.ycm_extra_conf.py @@ -0,0 +1,139 @@ +# This file is NOT licensed under the GPLv3, which is the license for the rest +# of YouCompleteMe. +# +# Here's the license text for this file: +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# +# For more information, please refer to + +import os +import ycm_core + +# These are the compilation flags that will be used in case there's no +# compilation database set (by default, one is not set). +# CHANGE THIS LIST OF FLAGS. YES, THIS IS THE DROID YOU HAVE BEEN LOOKING FOR. 
# Compiler flags handed to libclang for semantic completion when no
# compilation database is configured.
#
# Only flags that influence *parsing* belong here (warnings, defines, language
# selection, include paths).  Link-time inputs such as '-lglut', '-lGL' and
# '-lGLEW' were removed: nothing is ever linked when completing, so they are
# at best ignored and at worst reported as unused arguments.
flags = [
    # Diagnostics.
    '-Wall',
    '-Wextra',
    '-Werror',
    '-Wc++98-compat',
    '-Wno-long-long',
    '-Wno-variadic-macros',
    # Code generation / preprocessor.
    '-fexceptions',
    '-DNDEBUG',
    '-DUSE_CLANG_COMPLETER',
    # THIS IS IMPORTANT! Without a "-std=" flag, clang won't know which
    # language to use when compiling headers. So it will guess. Badly. So C++
    # headers will be compiled as C headers. You don't want that so ALWAYS
    # specify a "-std=".
    # For a C project, you would set this to something like 'c99' instead of
    # 'c++11'.
    '-std=c++11',
    # ...and the same thing goes for the magic -x option which specifies the
    # language that the files to be compiled are written in. This is mostly
    # relevant for c++ headers.
    # For a C project, you would set this to 'c' instead of 'c++'.
    '-x',
    'c++',
    # CUDA toolkit and sample helper headers (already absolute paths).
    '-I/usr/local/cuda/samples/common/inc',
    '-I/usr/local/cuda/include',
]

# Set this to the absolute path to the folder (NOT the file!) containing the
# compile_commands.json file to use that instead of 'flags'. See here for
# more details: http://clang.llvm.org/docs/JSONCompilationDatabase.html
#
# Most projects will NOT need to set this to anything; you can just change the
# 'flags' list of compilation flags. Notice that YCM itself uses that approach.
+compilation_database_folder = '' + +if compilation_database_folder: + database = ycm_core.CompilationDatabase( compilation_database_folder ) +else: + database = None + + +def DirectoryOfThisScript(): + return os.path.dirname( os.path.abspath( __file__ ) ) + + +def MakeRelativePathsInFlagsAbsolute( flags, working_directory ): + if not working_directory: + return list( flags ) + new_flags = [] + make_next_absolute = False + path_flags = [ '-isystem', '-I', '-iquote', '--sysroot=' ] + for flag in flags: + new_flag = flag + + if make_next_absolute: + make_next_absolute = False + if not flag.startswith( '/' ): + new_flag = os.path.join( working_directory, flag ) + + for path_flag in path_flags: + if flag == path_flag: + make_next_absolute = True + break + + if flag.startswith( path_flag ): + path = flag[ len( path_flag ): ] + new_flag = path_flag + os.path.join( working_directory, path ) + break + + if new_flag: + new_flags.append( new_flag ) + return new_flags + + +def FlagsForFile( filename ): + if database: + # Bear in mind that compilation_info.compiler_flags_ does NOT return a + # python list, but a "list-like" StringVec object + compilation_info = database.GetCompilationInfoForFile( filename ) + final_flags = MakeRelativePathsInFlagsAbsolute( + compilation_info.compiler_flags_, + compilation_info.compiler_working_dir_ ) + + # NOTE: This is just for YouCompleteMe; it's highly likely that your project + # does NOT need to remove the stdlib flag. DO NOT USE THIS IN YOUR + # ycm_extra_conf IF YOU'RE NOT 100% YOU NEED IT. 
+ ''' + try: + final_flags.remove( '-stdlib=libc++' ) + except ValueError: + pass + ''' + else: + relative_to = DirectoryOfThisScript() + final_flags = MakeRelativePathsInFlagsAbsolute( flags, relative_to ) + + return { + 'flags': final_flags, + 'do_cache': True + } diff --git a/Part1/PROJ_NIX/shaders/heightFS.glsl b/Part1/PROJ_NIX/shaders/heightFS.glsl index e36d53e..4d948ec 100644 --- a/Part1/PROJ_NIX/shaders/heightFS.glsl +++ b/Part1/PROJ_NIX/shaders/heightFS.glsl @@ -1,4 +1,19 @@ +/* void main(void) { gl_FragColor = vec4(0.05,0.15,0.3,1.0); } +*/ + +varying vec2 v_Texcoords; +varying float f_height; + +void main(void) +{ + float shade = (1.0-2.0*sqrt(f_height)); + float alpha = float(mod(v_Texcoords.x+0.025, 0.05) > 0.046 || mod(v_Texcoords.y+0.025, 0.05) > 0.046); + vec4 color = mix(vec4(0.05,0.15,0.3,1.0), vec4(0.05, 0.3, 0.4, 1.0), alpha); + + //vec4 color = vec4(0.05,0.15,0.3,1.0); + gl_FragColor = shade*color; +} diff --git a/Part1/PROJ_NIX/shaders/heightVS.glsl b/Part1/PROJ_NIX/shaders/heightVS.glsl index eda1b93..032f050 100644 --- a/Part1/PROJ_NIX/shaders/heightVS.glsl +++ b/Part1/PROJ_NIX/shaders/heightVS.glsl @@ -1,3 +1,4 @@ +/* uniform mat4 u_projMatrix; attribute vec4 Position; @@ -7,3 +8,23 @@ void main(void) pos.z += 0.01; gl_Position = pos; } +*/ + +uniform mat4 u_projMatrix; +uniform sampler2D u_height; + +attribute vec4 Position; +attribute vec2 Texcoords; + +varying vec2 v_Texcoords; +varying float f_height; + +void main(void) +{ + v_Texcoords = Texcoords; + vec4 pos = Position; + f_height = texture2D(u_height, Texcoords).w; + pos.z = -0.01-clamp(f_height,0.0,2.0); + pos = u_projMatrix * pos; + gl_Position = pos; +} diff --git a/Part1/PROJ_NIX/shaders/planetFS.glsl b/Part1/PROJ_NIX/shaders/planetFS.glsl index e2c1350..9752dc7 100644 --- a/Part1/PROJ_NIX/shaders/planetFS.glsl +++ b/Part1/PROJ_NIX/shaders/planetFS.glsl @@ -1,4 +1,36 @@ +/* void main(void) { gl_FragColor = vec4(1.0); } +*/ + +#version 330 + +in vec3 WorldCoord; +in vec3 ToCam; 
+in vec3 Up; +in vec3 Right; +in vec2 TexCoord; +out vec4 FragColor; + +void main() +{ + vec2 coord = 2.01 * (TexCoord - vec2(0.5)); + float r = length(coord); + if (r >= 1.0) { discard; } + + float dist = length(WorldCoord); + if(dist <= 0.01) + { + FragColor = vec4(1.0); + return; + } + + vec3 N = Right*-coord.x + Up*coord.y + ToCam*sqrt(1-r*r); + vec3 L = normalize(-WorldCoord); + float light = 0.1 + 0.9*clamp(dot(N,L),0.0, 1.0)*exp(-dist); + //vec3 color = vec3(0.4, 0.1, 0.6); + vec3 color = vec3(1.0, 0.8, 1.0); + FragColor = vec4(color*light,1.0); +} diff --git a/Part1/PROJ_NIX/shaders/planetGS.glsl b/Part1/PROJ_NIX/shaders/planetGS.glsl index 88027d3..8f841c2 100644 --- a/Part1/PROJ_NIX/shaders/planetGS.glsl +++ b/Part1/PROJ_NIX/shaders/planetGS.glsl @@ -1,5 +1,5 @@ #version 330 - +/* uniform mat4 u_projMatrix; layout (points) in; @@ -13,3 +13,54 @@ void main() EmitVertex(); EndPrimitive(); } + +*/ + +uniform mat4 u_projMatrix; +uniform vec3 u_cameraPos; + +layout (points) in; +layout (triangle_strip) out; +layout (max_vertices = 4) out; + +out vec3 WorldCoord; +out vec3 ToCam; +out vec3 Up; +out vec3 Right; +out vec2 TexCoord; + +//const float scale = 0.03; +const float scale = 0.01; + +void main() +{ + vec3 Position = gl_in[0].gl_Position.xyz; + WorldCoord = Position; + + ToCam = normalize(u_cameraPos - Position); + Up = vec3(0.0, 0.0, 1.0); + Right = cross(ToCam, Up); + Up = cross(Right, ToCam); + + vec3 Pos = Position + scale*Right - scale*Up; + gl_Position = u_projMatrix * vec4(Pos, 1.0); + TexCoord = vec2(0.0, 0.0); + EmitVertex(); + + Pos = Position + scale*Right + scale*Up; + gl_Position = u_projMatrix * vec4(Pos, 1.0); + TexCoord = vec2(0.0, 1.0); + EmitVertex(); + + Pos = Position - scale*Right - scale*Up; + gl_Position = u_projMatrix * vec4(Pos, 1.0); + TexCoord = vec2(1.0, 0.0); + EmitVertex(); + + Pos = Position - scale*Right + scale*Up; + gl_Position = u_projMatrix * vec4(Pos, 1.0); + TexCoord = vec2(1.0, 1.0); + EmitVertex(); + + 
EndPrimitive(); +} diff --git a/Part1/resources/particle_flock_no_spin.png b/Part1/resources/particle_flock_no_spin.png new file mode 100644 index 0000000..29453a2 Binary files /dev/null and b/Part1/resources/particle_flock_no_spin.png differ diff --git a/Part1/resources/particles_flocking.png b/Part1/resources/particles_flocking.png new file mode 100644 index 0000000..c770426 Binary files /dev/null and b/Part1/resources/particles_flocking.png differ diff --git a/Part1/resources/planets.png b/Part1/resources/planets.png new file mode 100644 index 0000000..75236e5 Binary files /dev/null and b/Part1/resources/planets.png differ diff --git a/Part1/src/kernel.cu b/Part1/src/kernel.cu index 32b3cb1..21aafd4 100644 --- a/Part1/src/kernel.cu +++ b/Part1/src/kernel.cu @@ -5,23 +5,26 @@ #include "utilities.h" #include "kernel.h" -#if SHARED == 1 +#if SHARED == 1 #define ACC(x,y,z) sharedMemAcc(x,y,z) #else #define ACC(x,y,z) naiveAcc(x,y,z) #endif +#define FLOCKING 0 + //GLOBALS dim3 threadsPerBlock(blockSize); int numObjects; const float planetMass = 3e8; -const __device__ float starMass = 5e10; +const __device__ float starMass = 5e9; -const float scene_scale = 2e2; //size of the height map in simulation space +const float scene_scale = 2e1; //size of the height map in simulation space glm::vec4 * dev_pos; glm::vec3 * dev_vel; +glm::vec3 * dev_acc; void checkCUDAError(const char *msg, int line = -1) { @@ -69,7 +72,8 @@ void generateRandomPosArray(int time, int N, glm::vec4 * arr, float scale, float glm::vec3 rand = scale*(generateRandomNumberFromThread(time, index)-0.5f); arr[index].x = rand.x; arr[index].y = rand.y; - arr[index].z = 0.0f;//rand.z; + //arr[index].z = 0.0f;//rand.z; + arr[index].z = rand.z; arr[index].w = mass; } } @@ -85,7 +89,7 @@ void generateCircularVelArray(int time, int N, glm::vec3 * arr, glm::vec4 * pos) glm::vec3 R = glm::vec3(pos[index].x, pos[index].y, pos[index].z); float r = glm::length(R) + EPSILON; float s = sqrt(G*starMass/r); - glm::vec3 D 
= glm::normalize(glm::cross(R/r,glm::vec3(0,0,1))); + glm::vec3 D = 10.0f*glm::normalize(glm::cross(R/r,glm::vec3(0,0,1))); arr[index].x = s*D.x; arr[index].y = s*D.y; arr[index].z = s*D.z; @@ -102,11 +106,12 @@ void generateRandomVelArray(int time, int N, glm::vec3 * arr, float scale) glm::vec3 rand = scale*(generateRandomNumberFromThread(time, index) - 0.5f); arr[index].x = rand.x; arr[index].y = rand.y; - arr[index].z = 0.0;//rand.z; + arr[index].z = 0.0; + //arr[index].z = rand.z; } } -//TODO: Determine force between two bodies +//DONE: Determine force between two bodies __device__ glm::vec3 calculateAcceleration(glm::vec4 us, glm::vec4 them) { @@ -117,38 +122,203 @@ glm::vec3 calculateAcceleration(glm::vec4 us, glm::vec4 them) // G*m_us*m_them G*m_them //a = ------------- = -------- // m_us*r^2 r^2 - - return glm::vec3(0.0f); + + float m_them = them.w; + + glm::vec3 d = glm::vec3(us.x, us.y, us.z) - glm::vec3(them.x, them.y, them.z); + float r2 = glm::dot( d, d ); + + // EPSILON softening-factor + float a = -G*m_them/(r2 + 1e-1); + return ( r2 > 0.0f ) ? a*glm::normalize( d ) : glm::vec3(0.0f); /* the softening term only bounds the magnitude; normalize() of a zero offset is still NaN, so coincident bodies exert no force */ } -//TODO: Core force calc kernel global memory +//DONE: Core force calc kernel global memory __device__ glm::vec3 naiveAcc(int N, glm::vec4 my_pos, glm::vec4 * their_pos) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + // Calculate acceleration from star glm::vec3 acc = calculateAcceleration(my_pos, glm::vec4(0,0,0,starMass)); + // Calculate accelerations from other planets + for ( int i=0; i < N; ++i ) { + if ( i == index ) + continue; + acc += calculateAcceleration(my_pos, their_pos[i]); + } return acc; } -//TODO: Core force calc kernel shared memory +//DONE: Core force calc kernel shared memory __device__ glm::vec3 sharedMemAcc(int N, glm::vec4 my_pos, glm::vec4 * their_pos) { + __shared__ glm::vec4 shared_their_pos[blockSize]; glm::vec3 acc = calculateAcceleration(my_pos, glm::vec4(0,0,0,starMass)); + + int index = threadIdx.x + (blockIdx.x * blockDim.x); + // Copy a segment of positions 
from global to shared memory + int num_iter = 0; + for ( int i = 0; i < N; i += blockDim.x ) { + // Compute global memory index to pull in + int gbl_index = threadIdx.x + (num_iter * blockDim.x); + num_iter++; + + if ( gbl_index < N ) shared_their_pos[threadIdx.x] = their_pos[gbl_index]; /* the last tile may be partial when N is not a multiple of blockDim.x: never read past their_pos[N-1] */ + // Don't forget to sync after the copy + __syncthreads(); + + + // Calculate accelerations from other planets using from shared mem + for ( int j=0; j < blockDim.x; j++ ) { + if ( i+j < N && i+j != index ) /* skip ourselves and the unfilled (stale) slots of a partial tile */ + acc += calculateAcceleration(my_pos, shared_their_pos[j]); + } + // Sync before next copy + __syncthreads(); + } return acc; + //return calculateAcceleration(my_pos, glm::vec4(0,0,0,starMass)); } +//DONE +__device__ +glm::vec3 Alignment( int N, glm::vec4 my_pos, glm::vec4* pos, glm::vec3* vel ) +{ + glm::vec3 ave_vel(0.0f); /* explicit zero: do not rely on GLM's default constructor to initialise the accumulator */ + float r2; + glm::vec3 d; + int index = threadIdx.x + (blockIdx.x * blockDim.x); + // Compute average velocity + // Compute average position + int cnt = 0; /* was uninitialised, so the average was divided by garbage */ + for ( int i=0; i < N; ++i ) { + if ( i == index ) + continue; + d = glm::vec3( pos[i].x-my_pos.x, pos[i].y-my_pos.y, pos[i].z-my_pos.z); + r2 = glm::dot( d, d ); + if ( r2 < 5.0 ) { + ave_vel += glm::vec3( vel[i].x, vel[i].y, vel[i].z ); + cnt ++; + } + } + if ( cnt > 0 ) ave_vel = ave_vel/float(cnt); /* no neighbour in range: return the zero vector instead of 0/0 = NaN */ + + return ave_vel; + //return glm::vec3(0.0, 0.0, 0.0); +} + +//DONE +__device__ +glm::vec3 Cohesion( int N, glm::vec4 my_pos, glm::vec4* pos) +{ + glm::vec3 ave_pos(0.0f); /* explicit zero: do not rely on GLM's default constructor to initialise the accumulator */ + float r2; + glm::vec3 d; + int index = threadIdx.x + (blockIdx.x * blockDim.x); + int cnt = 0; + // Compute average position weighted by distance + for ( int i=0; i < N; ++i ) { + if ( i == index ) + continue; + d = glm::vec3( pos[i].x-my_pos.x, pos[i].y-my_pos.y, pos[i].z-my_pos.z); + r2 = glm::dot( d, d ); + if ( r2 < 5.0 ) { + cnt++; + ave_pos += glm::vec3( pos[i].x, pos[i].y, pos[i].z ); + } + } + if ( cnt == 0 ) return glm::vec3(0.0f); /* no neighbour in range: nothing to steer towards, and 0/0 below would be NaN */ ave_pos = ave_pos/float(cnt); + + d = glm::vec3(ave_pos.x-my_pos.x, ave_pos.y-my_pos.y, ave_pos.z-my_pos.z); + + //float r = glm::length(d) + EPSILON; + //float s = sqrt(1.0f/r); + //glm::vec3 D 
= glm::normalize(glm::cross(d/r,glm::vec3(0,0,1))); + + //return s*D; + + return ( glm::dot( d, d ) > 0.0f ) ? glm::normalize(d) : glm::vec3(0.0f); /* a zero-length (or NaN) offset has no direction; normalize() would return NaN */ +} + +//DONE +__device__ +glm::vec3 Seperation( int N, glm::vec4 my_pos, glm::vec4* pos ) +{ + + int index = threadIdx.x + (blockIdx.x * blockDim.x); + + glm::vec3 acc(0.0f); /* explicit zero: do not rely on GLM's default constructor to initialise the accumulator */ + glm::vec3 d; + float r2; + // Compute repulsion force + for ( int i=0; i < N; ++i ) { + if ( i == index ) + continue; + d = glm::vec3( pos[i].x-my_pos.x, pos[i].y-my_pos.y, pos[i].z-my_pos.z); + r2 = glm::dot( d, d ); + if ( r2 > 0.0f && r2 < 1.0 ) /* exactly coincident bodies have no repulsion direction (normalize(0) is NaN) */ + acc += -glm::normalize(d); + } + return acc; +} + +//Force pass: computes each body's acceleration and stores it in acc[] (no integration here) +__global__ +void updateF(int N, float dt, glm::vec4 * pos, glm::vec3 * vel, glm::vec3 * acc) +{ + int index = threadIdx.x + (blockIdx.x * blockDim.x); + glm::vec4 my_pos(0.0f); /* threads with index >= N still run ACC (sharedMemAcc needs the whole block at its __syncthreads), so give them a defined position */ + glm::vec3 accel; + + if(index < N) my_pos = pos[index]; + + accel = ACC(N, my_pos, pos); + + if(index < N) acc[index] = accel; +} //Simple Euler integration scheme __global__ -void update(int N, float dt, glm::vec4 * pos, glm::vec3 * vel) +void updateS(int N, float dt, glm::vec4 * pos, glm::vec3 * vel, glm::vec3 * acc, glm::vec3 star_position ) { int index = threadIdx.x + (blockIdx.x * blockDim.x); if( index < N ) { glm::vec4 my_pos = pos[index]; - glm::vec3 acc = ACC(N, my_pos, pos); - vel[index] += acc * dt; + + #if FLOCKING == 0 + //glm::vec3 acc = ACC(N, my_pos, pos); + vel[index] += acc[index] * dt; + #else + // Align velocity with flock average + glm::vec3 align_vel = Alignment( N, my_pos, pos, vel ); /* NOTE(review): Alignment reads vel[] of every body while other threads of this same kernel overwrite their vel[index] below, so the result depends on scheduling */ + // Attract towards flock average + glm::vec3 cohesion_vel = Cohesion( N, my_pos, pos ); + // Repel from nearby objects + glm::vec3 seperation_vel = Seperation( N, my_pos, pos ); + + glm::vec3 weights = glm::vec3( 0.8, 0.2, 0.5 ); + // Need some weights + vel[index] = weights.x*align_vel + weights.y*cohesion_vel + weights.z*seperation_vel; + + // Add in circular velocity around star + //glm::vec3 star_position( 1.0, 0.0, 0.0 ); + glm::vec3 R = glm::vec3(pos[index].x-star_position.x, 
pos[index].y-star_position.y, pos[index].z-star_position.z); + float r = glm::length(R) + EPSILON; + float s = sqrt(G*starMass/r); + glm::vec3 D = glm::normalize(glm::cross(R/r,glm::vec3(0,0,1))); + vel[index] += 1.0f*D; + + // Add in attractive velocity toward star + vel[index] += -0.01f*R; + + // Add in damping + //vel[index] *= 0.7f; + + #endif + pos[index].x += vel[index].x * dt; pos[index].y += vel[index].y * dt; pos[index].z += vel[index].z * dt; @@ -191,7 +361,8 @@ void sendToPBO(int N, glm::vec4 * pos, float4 * pbo, int width, int height, floa if(x>>(1, numObjects, dev_pos, scene_scale, planetMass); checkCUDAErrorWithLine("Kernel failed!"); @@ -219,10 +396,11 @@ void initCuda(int N) checkCUDAErrorWithLine("Kernel failed!"); } -void cudaNBodyUpdateWrapper(float dt) +void cudaNBodyUpdateWrapper(float dt, glm::vec3 goal_position ) { dim3 fullBlocksPerGrid((int)ceil(float(numObjects)/float(blockSize))); - update<<>>(numObjects, dt, dev_pos, dev_vel); + updateF<<>>(numObjects, dt, dev_pos, dev_vel, dev_acc); + updateS<<>>(numObjects, dt, dev_pos, dev_vel, dev_acc, goal_position); checkCUDAErrorWithLine("Kernel failed!"); } diff --git a/Part1/src/kernel.h b/Part1/src/kernel.h index 1f8b37a..44caeba 100644 --- a/Part1/src/kernel.h +++ b/Part1/src/kernel.h @@ -5,6 +5,7 @@ #include #include #include +#include "glm/glm.hpp" #if CUDA_VERSION >= 5000 #include @@ -12,12 +13,12 @@ #include #endif -#define blockSize 128 +#define blockSize 256 #define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__) -#define SHARED 0 +#define SHARED 1 void checkCUDAError(const char *msg, int line); -void cudaNBodyUpdateWrapper(float dt); +void cudaNBodyUpdateWrapper(float dt, glm::vec3 goal_position ); void initCuda(int N); void cudaUpdatePBO(float4 * pbodptr, int width, int height); void cudaUpdateVBO(float * vbodptr, int width, int height); diff --git a/Part1/src/main.cpp b/Part1/src/main.cpp index d4c9c5b..df38cf1 100644 --- a/Part1/src/main.cpp +++ b/Part1/src/main.cpp @@ 
-4,13 +4,14 @@ #include "main.h" -#define N_FOR_VIS 25 -#define DT 0.2 +#define N_FOR_VIS 8*1024 +#define DT 0.05 #define VISUALIZE 1 //------------------------------- //-------------MAIN-------------- //------------------------------- + int main(int argc, char** argv) { // Launch CUDA/GL @@ -43,7 +44,7 @@ int main(int argc, char** argv) glutDisplayFunc(display); glutKeyboardFunc(keyboard); - + glutMainLoop(); return 0; @@ -64,7 +65,7 @@ void runCuda() cudaGLMapBufferObject((void**)&dptrvert, planetVBO); // execute the kernel - cudaNBodyUpdateWrapper(DT); + cudaNBodyUpdateWrapper(DT, goal_position); #if VISUALIZE == 1 cudaUpdatePBO(dptr, field_width, field_height); cudaUpdateVBO(dptrvert, field_width, field_height); @@ -88,6 +89,7 @@ void display() timebase = time; frame = 0; } + runCuda(); char title[100]; @@ -152,6 +154,19 @@ void keyboard(unsigned char key, int x, int y) case(27): exit(1); break; + + case('w'): + goal_position.y += 0.1; + break; + case('s'): + goal_position.y -= 0.1; + break; + case('a'): + goal_position.x -= 0.1; + break; + case('d'): + goal_position.x += 0.1; + break; } } @@ -353,5 +368,7 @@ void deleteTexture(GLuint* tex) void shut_down(int return_code) { - exit(return_code); + printf( "Calling cudaDeviceReset \n" ); + cudaDeviceReset(); + //exit(return_code); } diff --git a/Part1/src/main.h b/Part1/src/main.h index 2b818bf..b0296c5 100644 --- a/Part1/src/main.h +++ b/Part1/src/main.h @@ -58,6 +58,9 @@ float zFar = 5.0; glm::mat4 projection; glm::mat4 view; glm::vec3 cameraPosition(1.75,1.75,1.35); + +glm::vec3 goal_position( 0.0, 0.0, 0.0 ); + //------------------------------- //----------CUDA STUFF----------- //------------------------------- @@ -74,7 +77,7 @@ int main(int argc, char** argv); //---------RUNTIME STUFF--------- //------------------------------- -void runCuda(); +void runCuda( glm::vec3 goal_position ); void display(); void keyboard(unsigned char key, int x, int y); diff --git a/README.md b/README.md index e3122aa..1c1744f 
100644 --- a/README.md +++ b/README.md @@ -1,4 +1,164 @@ +![planets](Part1/resources/planets.png) + +--- CIS565: Project 3: CUDA Simulation and GLSL Visualization +--- + +After completing the initial simulation requirements I chose to implement flocking. +Each particle has 3 behaviors: +Alignment - the particle velocity converges to the average velocity of its neighbors +Cohesion - the particle is attracted to the center of the flock +Separation - particles repel from other particles nearby so they don't collide. + +I chose to have a radius of interaction for each behavior which allows for various +overall flock behaviors. For instance, a small cohesion radius generally leads to +particles bunching into multiple smaller flocks while a large radius will lead the +particles to all converge to the global flock center. + +To keep the particles from leaving the viewing region I added a control point that +each particle is drawn towards, and to make things interesting I added a rotational +velocity component to this behavior. The control point may be moved around interactively +using the 'wasd' keys. + +Showing the particles flocking. +![flocking](Part1/resources/particles_flocking.png) + +Showing particles in a rest state ( no rotational velocity component added ). +![flocking](Part1/resources/particle_flock_no_spin.png) + +--- +Video +--- +I posted a video of flocking up on YouTube; it gets a little bit choppy due to my screencapture +software ( I'm working on putting up a better one ), but it gets the point across. + +http://youtu.be/GNqQbQlhye4 + +--- +Performance: Shared memory +--- +For the N-Body forces calculation it's recommended to load particle states into +shared memory for each block. I varied the block size for shared and non-shared +memory and saw that performance went up somewhat from 64 to 512. 
Then, fixing +the blocksize at 512 ( maximum performance ), I evaluated the number of particles +versus framerate and didn't find a large performance difference +between my shared memory implementation and the naive implementation, which +is surprising ( I'm still investigating this ). + +Note: I removed the ACC calculation for the grid for the test, allowing me +to observe the absolute number of particles that can be handled. + +particles | framerate + +BlockSize: 64 + +no shared memory: + +- 1024 | 38.46 +- 2048 | 33.53 +- 4096 | 20.37 +- 8192 | 7.98 +- 16384 | 2.30 +- 24576 | 1.06 + +shared memory: + +- 1024 | 39.68 +- 2048 | 33.80 +- 4096 | 21.32 +- 8192 | 8.64 +- 16384 | 2.57 +- 24576 | 1.19 + +BlockSize: 256 + +shared memory: + +- 1024 | 39.96 +- 2048 | 33.90 +- 4096 | 21.05 +- 8192 | 8.56 +- 16384 | 2.53 +- 24576 | 1.17 + +no shared memory: +- 1024 | 39.96 +- 2048 | 34.25 +- 4096 | 21.26 +- 8192 | 9.06 +- 16384 | 2.72 +- 24576 | 1.26 + +BlockSize: 512 + +no shared memory: + +- 64 | 38.31 +- 128 | 37.11 +- 256 | 37.00 +- 512 | 37.55 +- 1024 | 37.51 +- 2048 | 33.40 +- 4096 | 21.19 +- 8192 | 8.23 +- 16384 | 2.50 +- 24576 | 1.16 + +shared memory: + +- 1024 | 38.77 +- 2048 | 32.80 +- 4096 | 21.48 +- 8192 | 8.90 +- 16384 | 2.67 +- 24576 | 1.124 + +BlockSize: 1024 + +no shared memory: +- 1024 | 37.51 +- 2048 | 33.40 +- 4096 | 21.19 +- 8192 | 8.23 +- 16384 | 2.50 +- 24576 | 1.16 + +shared memory: +- 1024 | 38.58 +- 2048 | 33.30 +- 4096 | 21.13 +- 8192 | 8.73 +- 16384 | 2.58 +- 24576 | 1.19 + + +Generally, blocksize does not have a large effect on framerate. +There appears to be a few percent improvement when using shared memory +over the naive implementation. The following are the specs for my GPU, +if anyone is interested. 
+ +``` +GPU Info: +GeForce 610M +Total amount of global memory: 1024 MBytes (1073414144 bytes) +( 1) Multiprocessors, ( 48) CUDA Cores/MP: 48 CUDA Cores +GPU Clock rate: 1344 MHz (1.34 GHz) +Memory Clock rate: 800 Mhz +Memory Bus Width: 64-bit +Maximum Texture Dimension Size (x,y,z) 1D=(65536), 2D=(65536, 65535), 3D=(2048, 2048, 2048) +Maximum Layered 1D Texture Size, (num) layers 1D=(16384), 2048 layers +Maximum Layered 2D Texture Size, (num) layers 2D=(16384, 16384), 2048 layers +Total amount of constant memory: 65536 bytes +Total amount of shared memory per block: 49152 bytes +Total number of registers available per block: 32768 +Warp size: 32 +Maximum number of threads per multiprocessor: 1536 +Maximum number of threads per block: 1024 +Max dimension size of a thread block (x,y,z): (1024, 1024, 64) +Max dimension size of a grid size (x,y,z): (65535, 65535, 65535) +Maximum memory pitch: 2147483647 bytes +``` + === Fall 2013 --- @@ -15,6 +175,9 @@ Lab or in Moore100 labs. All machines in the SIG Lab and Moore100 are equipped with CUDA capable NVIDIA graphics cards. If this too proves to be a problem, please contact Patrick or Liam as soon as possible. + + + --- INTRODUCTION: ---