diff --git a/Part1/.ycm_extra_conf.py b/Part1/.ycm_extra_conf.py
new file mode 100644
index 0000000..70537d7
--- /dev/null
+++ b/Part1/.ycm_extra_conf.py
@@ -0,0 +1,139 @@
+# This file is NOT licensed under the GPLv3, which is the license for the rest
+# of YouCompleteMe.
+#
+# Here's the license text for this file:
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+# For more information, please refer to <http://unlicense.org/>
+
+import os
+import ycm_core
+
+# These are the compilation flags that will be used in case there's no
+# compilation database set (by default, one is not set).
+# CHANGE THIS LIST OF FLAGS. YES, THIS IS THE DROID YOU HAVE BEEN LOOKING FOR.
+flags = [
+'-lglut',
+'-lGL',
+'-lGLEW',
+'-Wall',
+'-Wextra',
+'-Werror',
+'-Wc++98-compat',
+'-Wno-long-long',
+'-Wno-variadic-macros',
+'-fexceptions',
+'-DNDEBUG',
+'-DUSE_CLANG_COMPLETER',
+# THIS IS IMPORTANT! Without a "-std=" flag, clang won't know which
+# language to use when compiling headers. So it will guess. Badly. So C++
+# headers will be compiled as C headers. You don't want that so ALWAYS specify
+# a "-std=".
+# For a C project, you would set this to something like 'c99' instead of
+# 'c++11'.
+'-std=c++11',
+# ...and the same thing goes for the magic -x option which specifies the
+# language that the files to be compiled are written in. This is mostly
+# relevant for c++ headers.
+# For a C project, you would set this to 'c' instead of 'c++'.
+'-x',
+'c++',
+'-I/usr/local/cuda/samples/common/inc',
+'-I/usr/local/cuda/include'
+]
+
+# Set this to the absolute path to the folder (NOT the file!) containing the
+# compile_commands.json file to use that instead of 'flags'. See here for
+# more details: http://clang.llvm.org/docs/JSONCompilationDatabase.html
+#
+# Most projects will NOT need to set this to anything; you can just change the
+# 'flags' list of compilation flags. Notice that YCM itself uses that approach.
+compilation_database_folder = ''
+# An empty string is falsy, so no database is loaded and 'database' stays None.
+if compilation_database_folder:
+ database = ycm_core.CompilationDatabase( compilation_database_folder )
+else:
+ database = None
+
+
+def DirectoryOfThisScript():
+ return os.path.dirname( os.path.abspath( __file__ ) )
+
+
+def MakeRelativePathsInFlagsAbsolute( flags, working_directory ):
+ if not working_directory:
+ return list( flags )
+ new_flags = []
+ make_next_absolute = False
+ path_flags = [ '-isystem', '-I', '-iquote', '--sysroot=' ]
+ for flag in flags:
+ new_flag = flag
+
+ if make_next_absolute:
+ make_next_absolute = False
+ if not flag.startswith( '/' ):
+ new_flag = os.path.join( working_directory, flag )
+
+ for path_flag in path_flags:
+ if flag == path_flag:
+ make_next_absolute = True
+ break
+
+ if flag.startswith( path_flag ):
+ path = flag[ len( path_flag ): ]
+ new_flag = path_flag + os.path.join( working_directory, path )
+ break
+
+ if new_flag:
+ new_flags.append( new_flag )
+ return new_flags
+
+
+def FlagsForFile( filename ):
+ if database:
+ # Bear in mind that compilation_info.compiler_flags_ does NOT return a
+ # python list, but a "list-like" StringVec object
+ compilation_info = database.GetCompilationInfoForFile( filename )
+ final_flags = MakeRelativePathsInFlagsAbsolute(
+ compilation_info.compiler_flags_,
+ compilation_info.compiler_working_dir_ )
+
+ # NOTE: This is just for YouCompleteMe; it's highly likely that your project
+ # does NOT need to remove the stdlib flag. DO NOT USE THIS IN YOUR
+ # ycm_extra_conf IF YOU'RE NOT 100% YOU NEED IT.
+ '''
+ try:
+ final_flags.remove( '-stdlib=libc++' )
+ except ValueError:
+ pass
+ '''
+ else:
+ relative_to = DirectoryOfThisScript()
+ final_flags = MakeRelativePathsInFlagsAbsolute( flags, relative_to )
+
+ return {
+ 'flags': final_flags,
+ 'do_cache': True
+ }
diff --git a/Part1/PROJ_NIX/shaders/heightFS.glsl b/Part1/PROJ_NIX/shaders/heightFS.glsl
index e36d53e..4d948ec 100644
--- a/Part1/PROJ_NIX/shaders/heightFS.glsl
+++ b/Part1/PROJ_NIX/shaders/heightFS.glsl
@@ -1,4 +1,19 @@
+/*
void main(void)
{
gl_FragColor = vec4(0.05,0.15,0.3,1.0);
}
+*/
+
+varying vec2 v_Texcoords;
+varying float f_height;
+
+void main(void)
+{
+    // sqrt() is undefined for negative input, so clamp the sampled height first
+    float shade = 1.0 - 2.0*sqrt(max(f_height, 0.0));
+    // 1.0 on the thin grid lines that repeat every 0.05 in texture space, else 0.0
+    float alpha = float(mod(v_Texcoords.x+0.025, 0.05) > 0.046 || mod(v_Texcoords.y+0.025, 0.05) > 0.046);
+    vec4 color = mix(vec4(0.05,0.15,0.3,1.0), vec4(0.05, 0.3, 0.4, 1.0), alpha);
+    gl_FragColor = shade*color;
+}
diff --git a/Part1/PROJ_NIX/shaders/heightVS.glsl b/Part1/PROJ_NIX/shaders/heightVS.glsl
index eda1b93..032f050 100644
--- a/Part1/PROJ_NIX/shaders/heightVS.glsl
+++ b/Part1/PROJ_NIX/shaders/heightVS.glsl
@@ -1,3 +1,4 @@
+/*
uniform mat4 u_projMatrix;
attribute vec4 Position;
@@ -7,3 +8,23 @@ void main(void)
pos.z += 0.01;
gl_Position = pos;
}
+*/
+
+uniform mat4 u_projMatrix;
+uniform sampler2D u_height;
+
+attribute vec4 Position;
+attribute vec2 Texcoords;
+
+varying vec2 v_Texcoords;
+varying float f_height;
+
+void main(void)
+{
+    // Hand the texture coordinate and the sampled height (w channel) to the varyings
+    v_Texcoords = Texcoords;
+    f_height = texture2D(u_height, Texcoords).w;
+    // Replace z with the height clamped to [0,2], negated and offset by 0.01
+    float z = -0.01 - clamp(f_height, 0.0, 2.0);
+    gl_Position = u_projMatrix * vec4(Position.xy, z, Position.w);
+}
diff --git a/Part1/PROJ_NIX/shaders/planetFS.glsl b/Part1/PROJ_NIX/shaders/planetFS.glsl
index e2c1350..9752dc7 100644
--- a/Part1/PROJ_NIX/shaders/planetFS.glsl
+++ b/Part1/PROJ_NIX/shaders/planetFS.glsl
@@ -1,4 +1,36 @@
+/*
void main(void)
{
gl_FragColor = vec4(1.0);
}
+*/
+
+#version 330
+
+in vec3 WorldCoord;
+in vec3 ToCam;
+in vec3 Up;
+in vec3 Right;
+in vec2 TexCoord;
+out vec4 FragColor;
+
+void main()
+{
+    // Map the quad's texture coordinate to roughly [-1,1] and cut out a disc
+    vec2 uv = 2.01 * (TexCoord - vec2(0.5));
+    float radius = length(uv);
+    if (radius >= 1.0)
+        discard;
+
+    // Anything within 0.01 of the origin is drawn unshaded in plain white
+    float dist = length(WorldCoord);
+    if (dist <= 0.01) {
+        FragColor = vec4(1.0);
+        return;
+    }
+
+    // Sphere-like normal from the billboard axes; light direction is toward the origin
+    vec3 normal = Up*uv.y - Right*uv.x + ToCam*sqrt(1 - radius*radius);
+    float diffuse = clamp(dot(normal, normalize(-WorldCoord)), 0.0, 1.0);
+    FragColor = vec4(vec3(1.0, 0.8, 1.0) * (0.1 + 0.9*diffuse*exp(-dist)), 1.0);
+}
diff --git a/Part1/PROJ_NIX/shaders/planetGS.glsl b/Part1/PROJ_NIX/shaders/planetGS.glsl
index 88027d3..8f841c2 100644
--- a/Part1/PROJ_NIX/shaders/planetGS.glsl
+++ b/Part1/PROJ_NIX/shaders/planetGS.glsl
@@ -1,5 +1,5 @@
#version 330
-
+/*
uniform mat4 u_projMatrix;
layout (points) in;
@@ -13,3 +13,54 @@ void main()
EmitVertex();
EndPrimitive();
}
+
+*/
+
+uniform mat4 u_projMatrix;
+uniform vec3 u_cameraPos;
+
+layout (points) in;
+layout (triangle_strip) out;
+layout (max_vertices = 4) out;
+
+out vec3 WorldCoord;
+out vec3 ToCam;
+out vec3 Up;
+out vec3 Right;
+out vec2 TexCoord;
+
+//const float scale = 0.03;
+const float scale = 0.01;
+
+// Assigns every output and emits one billboard corner. All outputs are written
+// per vertex because their values are undefined after EmitVertex() returns.
+void emitCorner(vec3 center, vec3 toCam, vec3 up, vec3 right, vec2 sgn, vec2 uv)
+{
+    WorldCoord = center;
+    ToCam = toCam;
+    Up = up;
+    Right = right;
+    TexCoord = uv;
+    vec3 corner = center + scale*(sgn.x*right + sgn.y*up);
+    gl_Position = u_projMatrix * vec4(corner, 1.0);
+    EmitVertex();
+}
+
+// Expands each input point into a camera-facing quad (4-vertex triangle strip)
+void main()
+{
+    vec3 center = gl_in[0].gl_Position.xyz;
+    vec3 toCam = normalize(u_cameraPos - center);
+    // Unit-length billboard axes; without normalize() the quad shrinks and the
+    // shading normal is skewed as the view direction approaches the z axis.
+    vec3 right = normalize(cross(toCam, vec3(0.0, 0.0, 1.0)));
+    vec3 up = cross(right, toCam);
+
+    // Corner order and texture coordinates are unchanged
+    emitCorner(center, toCam, up, right, vec2( 1.0, -1.0), vec2(0.0, 0.0));
+    emitCorner(center, toCam, up, right, vec2( 1.0,  1.0), vec2(0.0, 1.0));
+    emitCorner(center, toCam, up, right, vec2(-1.0, -1.0), vec2(1.0, 0.0));
+    emitCorner(center, toCam, up, right, vec2(-1.0,  1.0), vec2(1.0, 1.0));
+
+    EndPrimitive();
+}
diff --git a/Part1/resources/particle_flock_no_spin.png b/Part1/resources/particle_flock_no_spin.png
new file mode 100644
index 0000000..29453a2
Binary files /dev/null and b/Part1/resources/particle_flock_no_spin.png differ
diff --git a/Part1/resources/particles_flocking.png b/Part1/resources/particles_flocking.png
new file mode 100644
index 0000000..c770426
Binary files /dev/null and b/Part1/resources/particles_flocking.png differ
diff --git a/Part1/resources/planets.png b/Part1/resources/planets.png
new file mode 100644
index 0000000..75236e5
Binary files /dev/null and b/Part1/resources/planets.png differ
diff --git a/Part1/src/kernel.cu b/Part1/src/kernel.cu
index 32b3cb1..21aafd4 100644
--- a/Part1/src/kernel.cu
+++ b/Part1/src/kernel.cu
@@ -5,23 +5,26 @@
#include "utilities.h"
#include "kernel.h"
-#if SHARED == 1
+#if SHARED == 1
#define ACC(x,y,z) sharedMemAcc(x,y,z)
#else
#define ACC(x,y,z) naiveAcc(x,y,z)
#endif
+#define FLOCKING 0
+
//GLOBALS
dim3 threadsPerBlock(blockSize);
int numObjects;
const float planetMass = 3e8;
-const __device__ float starMass = 5e10;
+const __device__ float starMass = 5e9;
-const float scene_scale = 2e2; //size of the height map in simulation space
+const float scene_scale = 2e1; //size of the height map in simulation space
glm::vec4 * dev_pos;
glm::vec3 * dev_vel;
+glm::vec3 * dev_acc;
void checkCUDAError(const char *msg, int line = -1)
{
@@ -69,7 +72,8 @@ void generateRandomPosArray(int time, int N, glm::vec4 * arr, float scale, float
glm::vec3 rand = scale*(generateRandomNumberFromThread(time, index)-0.5f);
arr[index].x = rand.x;
arr[index].y = rand.y;
- arr[index].z = 0.0f;//rand.z;
+ //arr[index].z = 0.0f;//rand.z;
+ arr[index].z = rand.z;
arr[index].w = mass;
}
}
@@ -85,7 +89,7 @@ void generateCircularVelArray(int time, int N, glm::vec3 * arr, glm::vec4 * pos)
glm::vec3 R = glm::vec3(pos[index].x, pos[index].y, pos[index].z);
float r = glm::length(R) + EPSILON;
float s = sqrt(G*starMass/r);
- glm::vec3 D = glm::normalize(glm::cross(R/r,glm::vec3(0,0,1)));
+ glm::vec3 D = 10.0f*glm::normalize(glm::cross(R/r,glm::vec3(0,0,1)));
arr[index].x = s*D.x;
arr[index].y = s*D.y;
arr[index].z = s*D.z;
@@ -102,11 +106,12 @@ void generateRandomVelArray(int time, int N, glm::vec3 * arr, float scale)
glm::vec3 rand = scale*(generateRandomNumberFromThread(time, index) - 0.5f);
arr[index].x = rand.x;
arr[index].y = rand.y;
- arr[index].z = 0.0;//rand.z;
+ arr[index].z = 0.0;
+ //arr[index].z = rand.z;
}
}
-//TODO: Determine force between two bodies
+//DONE: Determine force between two bodies
__device__
glm::vec3 calculateAcceleration(glm::vec4 us, glm::vec4 them)
{
@@ -117,38 +122,203 @@ glm::vec3 calculateAcceleration(glm::vec4 us, glm::vec4 them)
// G*m_us*m_them G*m_them
//a = ------------- = --------
// m_us*r^2 r^2
-
- return glm::vec3(0.0f);
+
+ float m_them = them.w;
+
+ glm::vec3 d = glm::vec3(us.x, us.y, us.z) - glm::vec3(them.x, them.y, them.z);
+ float r2 = glm::dot( d, d );
+
+ // EPSILON softening-factor
+ float a = -G*m_them/(r2 + 1e-1);
+ return a*glm::normalize( d );
}
-//TODO: Core force calc kernel global memory
+//DONE: Core force calc kernel global memory (star plus every other body)
__device__
glm::vec3 naiveAcc(int N, glm::vec4 my_pos, glm::vec4 * their_pos)
{
+    // Global index of the calling thread, used to skip the body's own entry
+    const int self = threadIdx.x + (blockIdx.x * blockDim.x);
+    // Start from the pull of the star, passed as a mass at the origin
glm::vec3 acc = calculateAcceleration(my_pos, glm::vec4(0,0,0,starMass));
+    // Add the pull of every other body, read straight from their_pos
+    for ( int other = 0; other < N; ++other ) {
+        if ( other != self )
+            acc += calculateAcceleration(my_pos, their_pos[other]);
+    }
return acc;
}
-//TODO: Core force calc kernel shared memory
+//DONE: Core force calc kernel shared memory (tiles of blockDim.x positions)
__device__
glm::vec3 sharedMemAcc(int N, glm::vec4 my_pos, glm::vec4 * their_pos)
{
+    __shared__ glm::vec4 shared_their_pos[blockSize];
glm::vec3 acc = calculateAcceleration(my_pos, glm::vec4(0,0,0,starMass));
+
+    int index = threadIdx.x + (blockIdx.x * blockDim.x);
+    // Walk the position array one tile of blockDim.x entries at a time.
+    // All threads of the block must get here: the loop uses __syncthreads().
+    for ( int i = 0; i < N; i += blockDim.x ) {
+        // Global index of the entry this thread stages into shared memory
+        int gbl_index = i + threadIdx.x;
+        // The last tile is partial when N is not a multiple of blockDim.x,
+        // so only indices below N are ever read from their_pos
+        if ( gbl_index < N )
+            shared_their_pos[threadIdx.x] = their_pos[gbl_index];
+        // Don't forget to sync after the copy
+        __syncthreads();
+
+        // Accumulate only over the valid entries of this tile, skipping self
+        int tile_size = min( (int)blockDim.x, N - i );
+        for ( int j=0; j < tile_size; j++ ) {
+            if ( i+j != index )
+                acc += calculateAcceleration(my_pos, shared_their_pos[j]);
+        }
+        // Sync before the next copy overwrites the tile
+        __syncthreads();
+    }
return acc;
+}
+//DONE: Alignment - mean velocity of nearby particles
+__device__
+glm::vec3 Alignment( int N, glm::vec4 my_pos, glm::vec4* pos, glm::vec3* vel )
+{
+    // Returns the mean velocity of the other particles whose squared distance
+    // to my_pos is below 5.0, or the zero vector when there are none.
+    glm::vec3 ave_vel( 0.0f );
+    int cnt = 0;
+    int index = threadIdx.x + (blockIdx.x * blockDim.x);
+    for ( int i=0; i < N; ++i ) {
+        if ( i == index )
+            continue;
+        glm::vec3 d( pos[i].x-my_pos.x, pos[i].y-my_pos.y, pos[i].z-my_pos.z );
+        float r2 = glm::dot( d, d );
+        if ( r2 < 5.0f ) {
+            ave_vel += vel[i];
+            cnt++;
+        }
+    }
+    // Guard the division: a particle with no neighbours would otherwise
+    // divide by zero and hand a NaN velocity back to the caller.
+    if ( cnt > 0 )
+        ave_vel /= float(cnt);
+
+    return ave_vel;
+}
+
+//DONE: Cohesion - direction toward the centre of nearby particles
+__device__
+glm::vec3 Cohesion( int N, glm::vec4 my_pos, glm::vec4* pos)
+{
+    // Returns the unit vector from my_pos toward the (unweighted) mean position
+    // of the other particles whose squared distance to my_pos is below 5.0.
+    // Returns the zero vector when there is no such particle or when the mean
+    // coincides with my_pos, instead of the NaN a division by zero would give.
+    const glm::vec3 me( my_pos.x, my_pos.y, my_pos.z );
+    glm::vec3 ave_pos( 0.0f );
+    int cnt = 0;
+    int index = threadIdx.x + (blockIdx.x * blockDim.x);
+    for ( int i=0; i < N; ++i ) {
+        if ( i == index )
+            continue;
+        glm::vec3 other( pos[i].x, pos[i].y, pos[i].z );
+        glm::vec3 d = other - me;
+        if ( glm::dot( d, d ) < 5.0f ) {
+            cnt++;
+            ave_pos += other;
+        }
+    }
+    if ( cnt == 0 )
+        return glm::vec3( 0.0f );
+    ave_pos /= float(cnt);
+
+    glm::vec3 to_center = ave_pos - me;
+    float len = glm::length( to_center );
+    if ( len <= 0.0f )
+        return glm::vec3( 0.0f );
+    return to_center / len;
+}
+
+//DONE: Separation - push away from particles that are too close
+__device__
+glm::vec3 Seperation( int N, glm::vec4 my_pos, glm::vec4* pos )
+{
+    // Returns the sum of unit vectors pointing away from every other particle
+    // whose squared distance to my_pos is below 1.0.
+    int index = threadIdx.x + (blockIdx.x * blockDim.x);
+
+    // Start from an explicit zero instead of relying on the default constructor
+    glm::vec3 acc( 0.0f );
+    for ( int i=0; i < N; ++i ) {
+        if ( i == index )
+            continue;
+        glm::vec3 d( pos[i].x-my_pos.x, pos[i].y-my_pos.y, pos[i].z-my_pos.z );
+        float r2 = glm::dot( d, d );
+        // Skip coincident particles: normalizing a zero vector would yield NaN
+        if ( r2 > 0.0f && r2 < 1.0f )
+            acc -= glm::normalize( d );
+    }
+    return acc;
+}
+
+//Force pass: stores each body's acceleration in acc[index] (dt, vel unused)
+__global__
+void updateF(int N, float dt, glm::vec4 * pos, glm::vec3 * vel, glm::vec3 * acc)
+{
+    int index = threadIdx.x + (blockIdx.x * blockDim.x);
+    // Threads with index >= N still have to call ACC, because the shared-memory
+    // variant uses __syncthreads(); give them a defined dummy position.
+    glm::vec4 my_pos( 0.0f );
+    if(index < N) my_pos = pos[index];
+
+    glm::vec3 accel = ACC(N, my_pos, pos);
+
+    if(index < N) acc[index] = accel;
+}
//Simple Euler integration scheme
__global__
-void update(int N, float dt, glm::vec4 * pos, glm::vec3 * vel)
+void updateS(int N, float dt, glm::vec4 * pos, glm::vec3 * vel, glm::vec3 * acc, glm::vec3 star_position )
{
int index = threadIdx.x + (blockIdx.x * blockDim.x);
if( index < N )
{
glm::vec4 my_pos = pos[index];
- glm::vec3 acc = ACC(N, my_pos, pos);
- vel[index] += acc * dt;
+
+ #if FLOCKING == 0
+ //glm::vec3 acc = ACC(N, my_pos, pos);
+ vel[index] += acc[index] * dt;
+ #else
+ // Align velocity with flock average
+ glm::vec3 align_vel = Alignment( N, my_pos, pos, vel );
+ // Attract towards flock average
+ glm::vec3 cohesion_vel = Cohesion( N, my_pos, pos );
+ // Repel from nearby objects
+ glm::vec3 seperation_vel = Seperation( N, my_pos, pos );
+
+ glm::vec3 weights = glm::vec3( 0.8, 0.2, 0.5 );
+ // Need some weights
+ vel[index] = weights.x*align_vel + weights.y*cohesion_vel + weights.z*seperation_vel;
+
+ // Add in circular velocity around star
+ //glm::vec3 star_position( 1.0, 0.0, 0.0 );
+ glm::vec3 R = glm::vec3(pos[index].x-star_position.x, pos[index].y-star_position.y, pos[index].z-star_position.z);
+ float r = glm::length(R) + EPSILON;
+ float s = sqrt(G*starMass/r);
+ glm::vec3 D = glm::normalize(glm::cross(R/r,glm::vec3(0,0,1)));
+ vel[index] += 1.0f*D;
+
+ // Add in attractive velocity toward star
+ vel[index] += -0.01f*R;
+
+ // Add in damping
+ //vel[index] *= 0.7f;
+
+ #endif
+
pos[index].x += vel[index].x * dt;
pos[index].y += vel[index].y * dt;
pos[index].z += vel[index].z * dt;
@@ -191,7 +361,8 @@ void sendToPBO(int N, glm::vec4 * pos, float4 * pbo, int width, int height, floa
if(x>>(1, numObjects, dev_pos, scene_scale, planetMass);
checkCUDAErrorWithLine("Kernel failed!");
@@ -219,10 +396,11 @@ void initCuda(int N)
checkCUDAErrorWithLine("Kernel failed!");
}
-void cudaNBodyUpdateWrapper(float dt)
+void cudaNBodyUpdateWrapper(float dt, glm::vec3 goal_position )
{
dim3 fullBlocksPerGrid((int)ceil(float(numObjects)/float(blockSize)));
- update<<>>(numObjects, dt, dev_pos, dev_vel);
+ updateF<<>>(numObjects, dt, dev_pos, dev_vel, dev_acc);
+ updateS<<>>(numObjects, dt, dev_pos, dev_vel, dev_acc, goal_position);
checkCUDAErrorWithLine("Kernel failed!");
}
diff --git a/Part1/src/kernel.h b/Part1/src/kernel.h
index 1f8b37a..44caeba 100644
--- a/Part1/src/kernel.h
+++ b/Part1/src/kernel.h
@@ -5,6 +5,7 @@
#include
#include
#include
+#include "glm/glm.hpp"
#if CUDA_VERSION >= 5000
#include
@@ -12,12 +13,12 @@
#include
#endif
-#define blockSize 128
+#define blockSize 256
#define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__)
-#define SHARED 0
+#define SHARED 1
void checkCUDAError(const char *msg, int line);
-void cudaNBodyUpdateWrapper(float dt);
+void cudaNBodyUpdateWrapper(float dt, glm::vec3 goal_position );
void initCuda(int N);
void cudaUpdatePBO(float4 * pbodptr, int width, int height);
void cudaUpdateVBO(float * vbodptr, int width, int height);
diff --git a/Part1/src/main.cpp b/Part1/src/main.cpp
index d4c9c5b..df38cf1 100644
--- a/Part1/src/main.cpp
+++ b/Part1/src/main.cpp
@@ -4,13 +4,14 @@
#include "main.h"
-#define N_FOR_VIS 25
-#define DT 0.2
+#define N_FOR_VIS (8*1024) // parenthesized so the macro expands as a single value
+#define DT 0.05
#define VISUALIZE 1
//-------------------------------
//-------------MAIN--------------
//-------------------------------
+
int main(int argc, char** argv)
{
// Launch CUDA/GL
@@ -43,7 +44,7 @@ int main(int argc, char** argv)
glutDisplayFunc(display);
glutKeyboardFunc(keyboard);
-
+
glutMainLoop();
return 0;
@@ -64,7 +65,7 @@ void runCuda()
cudaGLMapBufferObject((void**)&dptrvert, planetVBO);
// execute the kernel
- cudaNBodyUpdateWrapper(DT);
+ cudaNBodyUpdateWrapper(DT, goal_position);
#if VISUALIZE == 1
cudaUpdatePBO(dptr, field_width, field_height);
cudaUpdateVBO(dptrvert, field_width, field_height);
@@ -88,6 +89,7 @@ void display()
timebase = time;
frame = 0;
}
+
runCuda();
char title[100];
@@ -152,6 +154,19 @@ void keyboard(unsigned char key, int x, int y)
case(27):
exit(1);
break;
+
+ case('w'):
+ goal_position.y += 0.1;
+ break;
+ case('s'):
+ goal_position.y -= 0.1;
+ break;
+ case('a'):
+ goal_position.x -= 0.1;
+ break;
+ case('d'):
+ goal_position.x += 0.1;
+ break;
}
}
@@ -353,5 +368,7 @@ void deleteTexture(GLuint* tex)
void shut_down(int return_code)
{
- exit(return_code);
+ printf( "Calling cudaDeviceReset \n" );
+ cudaDeviceReset();
+ // NOTE(review): exit(return_code) is disabled, so return_code is unused and control returns to the caller - confirm this is intended
}
diff --git a/Part1/src/main.h b/Part1/src/main.h
index 2b818bf..b0296c5 100644
--- a/Part1/src/main.h
+++ b/Part1/src/main.h
@@ -58,6 +58,9 @@ float zFar = 5.0;
glm::mat4 projection;
glm::mat4 view;
glm::vec3 cameraPosition(1.75,1.75,1.35);
+
+glm::vec3 goal_position( 0.0, 0.0, 0.0 );
+
//-------------------------------
//----------CUDA STUFF-----------
//-------------------------------
@@ -74,7 +77,7 @@ int main(int argc, char** argv);
//---------RUNTIME STUFF---------
//-------------------------------
-void runCuda();
+void runCuda(); // defined and called without arguments in main.cpp; uses the global goal_position
void display();
void keyboard(unsigned char key, int x, int y);
diff --git a/README.md b/README.md
index e3122aa..1c1744f 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,164 @@
+
+
+---
CIS565: Project 3: CUDA Simulation and GLSL Visualization
+---
+
+After completing the initial simulation requirements I chose to implement flocking.
+Each particle has 3 behaviors:
+Alignment - the particle velocity converges to the average velocity of its neighbors
+Cohesion - the particle is attracted to the center of the flock
+Separation - particles repel from other particles nearby so they don't collide.
+
+I chose to have a radius of interaction for each behavior which allows for various
+overall flock behaviors. For instance a small cohesion radius generally leads to
+particles bunching into multiple smaller flocks while a large radius will lead the
+particles to all converge to the global flock center.
+
+To keep the particles from leaving the viewing region I added a control point that
+each particle is drawn towards and to make things interesting I added a rotational
+velocity component to this behavior. The control point may be moved around interactively
+using the 'wasd' keys.
+
+Showing the particles flocking.
+
+
+Showing particles in a rest state ( no rotational velocity component added ).
+
+
+---
+Video
+---
+I posted a video of the flocking on YouTube. It gets a little choppy because of my screen-capture
+software ( I'm working on putting up a better one ), but it gets the point across.
+
+http://youtu.be/GNqQbQlhye4
+
+---
+Performance: Shared memory
+---
+For the N-Body force calculation it's recommended to load particle states into
+shared memory for each block. I varied the block size for shared and non-shared
+memory and saw that performance went up somewhat from 64 to 512. Then, fixing
+the block size at 512 ( maximum performance ), I measured framerate against the
+number of particles and, surprisingly, didn't find a large performance
+difference between my shared memory implementation and the naive
+implementation ( I'm still investigating this ).
+
+Note: I removed the ACC calculation for the grid for the test, allowing me
+to observe the absolute number of particles that can be handled.
+
+particles | framerate
+
+BlockSize: 64
+
+no shared memory:
+
+- 1024 | 38.46
+- 2048 | 33.53
+- 4096 | 20.37
+- 8192 | 7.98
+- 16384 | 2.30
+- 24576 | 1.06
+
+shared memory:
+
+- 1024 | 39.68
+- 2048 | 33.80
+- 4096 | 21.32
+- 8192 | 8.64
+- 16384 | 2.57
+- 24576 | 1.19
+
+BlockSize: 256
+
+shared memory:
+
+- 1024 | 39.96
+- 2048 | 33.90
+- 4096 | 21.05
+- 8192 | 8.56
+- 16384 | 2.53
+- 24576 | 1.17
+
+no shared memory:
+- 1024 | 39.96
+- 2048 | 34.25
+- 4096 | 21.26
+- 8192 | 9.06
+- 16384 | 2.72
+- 24576 | 1.26
+
+BlockSize: 512
+
+no shared memory:
+
+- 64 | 38.31
+- 128 | 37.11
+- 256 | 37.00
+- 512 | 37.55
+- 1024 | 37.51
+- 2048 | 33.40
+- 4096 | 21.19
+- 8192 | 8.23
+- 16384 | 2.50
+- 24576 | 1.16
+
+shared memory:
+
+- 1024 | 38.77
+- 2048 | 32.80
+- 4096 | 21.48
+- 8192 | 8.90
+- 16384 | 2.67
+- 24576 | 1.124
+
+BlockSize: 1024
+
+no shared memory:
+- 1024 | 37.51
+- 2048 | 33.40
+- 4096 | 21.19
+- 8192 | 8.23
+- 16384 | 2.50
+- 24576 | 1.16
+
+shared memory:
+- 1024 | 38.58
+- 2048 | 33.30
+- 4096 | 21.13
+- 8192 | 8.73
+- 16384 | 2.58
+- 24576 | 1.19
+
+
+Generally, block size does not have a large effect on framerate.
+There appears to be an improvement of a few percent when using shared memory
+over the naive implementation. The following are the specs for my GPU
+if anyone is interested.
+
+```
+GPU Info:
+GeForce 610M
+Total amount of global memory: 1024 MBytes (1073414144 bytes)
+( 1) Multiprocessors, ( 48) CUDA Cores/MP: 48 CUDA Cores
+GPU Clock rate: 1344 MHz (1.34 GHz)
+Memory Clock rate: 800 Mhz
+Memory Bus Width: 64-bit
+Maximum Texture Dimension Size (x,y,z) 1D=(65536), 2D=(65536, 65535), 3D=(2048, 2048, 2048)
+Maximum Layered 1D Texture Size, (num) layers 1D=(16384), 2048 layers
+Maximum Layered 2D Texture Size, (num) layers 2D=(16384, 16384), 2048 layers
+Total amount of constant memory: 65536 bytes
+Total amount of shared memory per block: 49152 bytes
+Total number of registers available per block: 32768
+Warp size: 32
+Maximum number of threads per multiprocessor: 1536
+Maximum number of threads per block: 1024
+Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
+Max dimension size of a grid size (x,y,z): (65535, 65535, 65535)
+Maximum memory pitch: 2147483647 bytes
+```
+
===
Fall 2013
---
@@ -15,6 +175,9 @@ Lab or in Moore100 labs. All machines in the SIG Lab and Moore100 are equipped
with CUDA capable NVIDIA graphics cards. If this too proves to be a problem,
please contact Patrick or Liam as soon as possible.
+
+
+
---
INTRODUCTION:
---