diff --git a/Part1/.ycm_extra_conf.py b/Part1/.ycm_extra_conf.py
new file mode 100644
index 0000000..70537d7
--- /dev/null
+++ b/Part1/.ycm_extra_conf.py
@@ -0,0 +1,139 @@
+# This file is NOT licensed under the GPLv3, which is the license for the rest
+# of YouCompleteMe.
+#
+# Here's the license text for this file:
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+# For more information, please refer to <http://unlicense.org/>
+
+import os
+import ycm_core
+
+# These are the compilation flags that will be used in case there's no
+# compilation database set (by default, one is not set).
+# CHANGE THIS LIST OF FLAGS. YES, THIS IS THE DROID YOU HAVE BEEN LOOKING FOR.
+flags = [
+'-lglut',  # NOTE(review): the three -l* entries are linker inputs; presumably not needed for completion - confirm
+'-lGL',
+'-lGLEW',
+'-Wall',
+'-Wextra',
+'-Werror',
+'-Wc++98-compat',
+'-Wno-long-long',
+'-Wno-variadic-macros',
+'-fexceptions',
+'-DNDEBUG',
+'-DUSE_CLANG_COMPLETER',
+# THIS IS IMPORTANT! Without a "-std=<something>" flag, clang won't know which
+# language to use when compiling headers. So it will guess. Badly. So C++
+# headers will be compiled as C headers. You don't want that so ALWAYS specify
+# a "-std=<something>".
+# For a C project, you would set this to something like 'c99' instead of
+# 'c++11'.
+'-std=c++11',
+# ...and the same thing goes for the magic -x option which specifies the
+# language that the files to be compiled are written in. This is mostly
+# relevant for c++ headers.
+# For a C project, you would set this to 'c' instead of 'c++'.
+'-x',
+'c++',
+'-I/usr/local/cuda/samples/common/inc',  # absolute CUDA include paths, specific to this machine's install
+'-I/usr/local/cuda/include'
+]
+
+# Set this to the absolute path to the folder (NOT the file!) containing the
+# compile_commands.json file to use that instead of 'flags'. See here for
+# more details: http://clang.llvm.org/docs/JSONCompilationDatabase.html
+#
+# Most projects will NOT need to set this to anything; you can just change the
+# 'flags' list of compilation flags. Notice that YCM itself uses that approach.
+compilation_database_folder = ''  # empty string: no compilation database is used
+
+if compilation_database_folder:
+  database = ycm_core.CompilationDatabase( compilation_database_folder )
+else:
+  database = None  # FlagsForFile() then falls back to the hard-coded 'flags' list
+
+
+def DirectoryOfThisScript():  # absolute path of the directory that contains this file
+  return os.path.dirname( os.path.abspath( __file__ ) )
+
+
+def MakeRelativePathsInFlagsAbsolute( flags, working_directory ):
+  if not working_directory:
+    return list( flags )
+  new_flags = []
+  make_next_absolute = False
+  path_flags = [ '-isystem', '-I', '-iquote', '--sysroot=' ]
+  for flag in flags:
+    new_flag = flag
+
+    if make_next_absolute:
+      make_next_absolute = False
+      if not flag.startswith( '/' ):
+        new_flag = os.path.join( working_directory, flag )
+
+    for path_flag in path_flags:
+      if flag == path_flag:
+        make_next_absolute = True
+        break
+
+      if flag.startswith( path_flag ):
+        path = flag[ len( path_flag ): ]
+        new_flag = path_flag + os.path.join( working_directory, path )
+        break
+
+    if new_flag:
+      new_flags.append( new_flag )
+  return new_flags
+
+
+def FlagsForFile( filename ):
+  if database:
+    # Bear in mind that compilation_info.compiler_flags_ does NOT return a
+    # python list, but a "list-like" StringVec object
+    compilation_info = database.GetCompilationInfoForFile( filename )
+    final_flags = MakeRelativePathsInFlagsAbsolute(
+      compilation_info.compiler_flags_,
+      compilation_info.compiler_working_dir_ )
+
+    # NOTE: This is just for YouCompleteMe; it's highly likely that your project
+    # does NOT need to remove the stdlib flag. DO NOT USE THIS IN YOUR
+    # ycm_extra_conf IF YOU'RE NOT 100% YOU NEED IT.
+    '''
+    try:
+      final_flags.remove( '-stdlib=libc++' )
+    except ValueError:
+      pass
+    '''
+  else:
+    relative_to = DirectoryOfThisScript()
+    final_flags = MakeRelativePathsInFlagsAbsolute( flags, relative_to )
+
+  return {
+    'flags': final_flags,
+    'do_cache': True
+  }
diff --git a/Part1/PROJ_NIX/shaders/heightFS.glsl b/Part1/PROJ_NIX/shaders/heightFS.glsl
index e36d53e..4d948ec 100644
--- a/Part1/PROJ_NIX/shaders/heightFS.glsl
+++ b/Part1/PROJ_NIX/shaders/heightFS.glsl
@@ -1,4 +1,19 @@
+/*
 void main(void)
 {
 	gl_FragColor = vec4(0.05,0.15,0.3,1.0);
 }
+*/
+
+varying vec2 v_Texcoords;
+varying float f_height;
+
+void main(void)
+{
+    float shade = (1.0-2.0*sqrt(f_height)); // darker where f_height is larger
+    float alpha = float(mod(v_Texcoords.x+0.025, 0.05) > 0.046 || mod(v_Texcoords.y+0.025, 0.05) > 0.046); // 1.0 on grid lines that repeat every 0.05 in texcoords, else 0.0
+    vec4 color = mix(vec4(0.05,0.15,0.3,1.0), vec4(0.05, 0.3, 0.4, 1.0), alpha); // second colour on the grid lines, first colour elsewhere
+
+    // Note: shade scales all four channels of color, including alpha.
+    gl_FragColor = shade*color;
+}
diff --git a/Part1/PROJ_NIX/shaders/heightVS.glsl b/Part1/PROJ_NIX/shaders/heightVS.glsl
index eda1b93..032f050 100644
--- a/Part1/PROJ_NIX/shaders/heightVS.glsl
+++ b/Part1/PROJ_NIX/shaders/heightVS.glsl
@@ -1,3 +1,4 @@
+/*
 uniform mat4 u_projMatrix;
 attribute vec4 Position;
 
@@ -7,3 +8,23 @@ void main(void)
 	pos.z += 0.01;
 	gl_Position = pos;
 }
+*/
+
+uniform mat4 u_projMatrix;
+uniform sampler2D u_height; // height field; only its .w channel is read below
+
+attribute vec4 Position;
+attribute vec2 Texcoords;
+
+varying vec2 v_Texcoords; // passed through unchanged for the fragment shader
+varying float f_height;   // sampled height, also used for shading in the fragment shader
+
+void main(void)
+{
+    v_Texcoords = Texcoords;
+    vec4 pos = Position;
+    f_height = texture2D(u_height, Texcoords).w;
+    pos.z = -0.01-clamp(f_height,0.0,2.0); // z is replaced: 0.01 below zero, lowered further by the clamped height
+    pos = u_projMatrix * pos;
+    gl_Position = pos;
+}
diff --git a/Part1/PROJ_NIX/shaders/planetFS.glsl b/Part1/PROJ_NIX/shaders/planetFS.glsl
index e2c1350..9752dc7 100644
--- a/Part1/PROJ_NIX/shaders/planetFS.glsl
+++ b/Part1/PROJ_NIX/shaders/planetFS.glsl
@@ -1,4 +1,36 @@
+/*
 void main(void)
 {
 	gl_FragColor = vec4(1.0);
 }
+*/
+
+#version 330
+
+in vec3 WorldCoord;
+in vec3 ToCam;
+in vec3 Up;
+in vec3 Right;
+in vec2 TexCoord;
+out vec4 FragColor;
+
+void main()
+{
+    vec2 coord = 2.01 * (TexCoord - vec2(0.5)); // remap [0,1] texcoords to roughly [-1,1] around the quad centre
+    float r = length(coord);
+    if (r >= 1.0) { discard; } // keep only the disc inscribed in the quad
+
+    float dist = length(WorldCoord);
+    if(dist <= 0.01) // sprite (almost) at the origin: drawn unlit, plain white
+    {
+        FragColor = vec4(1.0);
+        return;
+    }
+
+    vec3 N = Right*-coord.x + Up*coord.y + ToCam*sqrt(1-r*r); // sphere-like normal built from the billboard basis
+    vec3 L = normalize(-WorldCoord); // light direction: from the sprite towards the origin
+    float light = 0.1 + 0.9*clamp(dot(N,L),0.0, 1.0)*exp(-dist); // constant 0.1 term plus diffuse term that decays with distance from the origin
+    // Base colour of every sprite
+    vec3 color = vec3(1.0, 0.8, 1.0);
+    FragColor = vec4(color*light,1.0);
+} 
diff --git a/Part1/PROJ_NIX/shaders/planetGS.glsl b/Part1/PROJ_NIX/shaders/planetGS.glsl
index 88027d3..8f841c2 100644
--- a/Part1/PROJ_NIX/shaders/planetGS.glsl
+++ b/Part1/PROJ_NIX/shaders/planetGS.glsl
@@ -1,5 +1,5 @@
 #version 330
-
+/*
 uniform mat4 u_projMatrix;
 
 layout (points) in;
@@ -13,3 +13,54 @@ void main()
     EmitVertex();
     EndPrimitive();
 }
+
+*/
+
+uniform mat4 u_projMatrix;
+uniform vec3 u_cameraPos;
+
+layout (points) in;
+layout (triangle_strip) out;
+layout (max_vertices = 4) out;
+
+out vec3 WorldCoord;
+out vec3 ToCam;
+out vec3 Up;
+out vec3 Right;
+out vec2 TexCoord;
+
+// Half-extent of the emitted quad along Right and Up.
+const float scale = 0.01;
+
+// Expands each input point into a camera-facing quad (4-vertex triangle strip).
+void main()
+{
+    vec3 Position = gl_in[0].gl_Position.xyz;
+    WorldCoord = Position;
+    // Billboard basis. Right is normalized so that (Right, Up, ToCam) is
+    // orthonormal: the quad keeps a constant size and the fragment shader can
+    // rebuild a unit-length normal from it.
+    ToCam = normalize(u_cameraPos - Position);
+    Up = vec3(0.0, 0.0, 1.0);
+    Right = normalize(cross(ToCam, Up));
+    Up = cross(Right, ToCam);
+
+    vec3 Pos = Position + scale*Right - scale*Up;
+    gl_Position = u_projMatrix * vec4(Pos, 1.0);
+    TexCoord = vec2(0.0, 0.0);
+    EmitVertex();
+    Pos = Position + scale*Right + scale*Up;
+    gl_Position = u_projMatrix * vec4(Pos, 1.0);
+    TexCoord = vec2(0.0, 1.0);
+    EmitVertex();
+    Pos = Position - scale*Right - scale*Up;
+    gl_Position = u_projMatrix * vec4(Pos, 1.0);
+    TexCoord = vec2(1.0, 0.0);
+    EmitVertex();
+    Pos = Position - scale*Right + scale*Up;
+    gl_Position = u_projMatrix * vec4(Pos, 1.0);
+    TexCoord = vec2(1.0, 1.0);
+    EmitVertex();
+
+    EndPrimitive();
+}
diff --git a/Part1/resources/particle_flock_no_spin.png b/Part1/resources/particle_flock_no_spin.png
new file mode 100644
index 0000000..29453a2
Binary files /dev/null and b/Part1/resources/particle_flock_no_spin.png differ
diff --git a/Part1/resources/particles_flocking.png b/Part1/resources/particles_flocking.png
new file mode 100644
index 0000000..c770426
Binary files /dev/null and b/Part1/resources/particles_flocking.png differ
diff --git a/Part1/resources/planets.png b/Part1/resources/planets.png
new file mode 100644
index 0000000..75236e5
Binary files /dev/null and b/Part1/resources/planets.png differ
diff --git a/Part1/src/kernel.cu b/Part1/src/kernel.cu
index 32b3cb1..21aafd4 100644
--- a/Part1/src/kernel.cu
+++ b/Part1/src/kernel.cu
@@ -5,23 +5,26 @@
 #include "utilities.h"
 #include "kernel.h"
 
-#if SHARED == 1
+#if SHARED == 1 
     #define ACC(x,y,z) sharedMemAcc(x,y,z)
 #else
     #define ACC(x,y,z) naiveAcc(x,y,z)
 #endif
 
+#define FLOCKING 0
+
 //GLOBALS
 dim3 threadsPerBlock(blockSize);
 
 int numObjects;
 const float planetMass = 3e8;
-const __device__ float starMass = 5e10;
+const __device__ float starMass = 5e9;
 
-const float scene_scale = 2e2; //size of the height map in simulation space
+const float scene_scale = 2e1; //size of the height map in simulation space
 
 glm::vec4 * dev_pos;
 glm::vec3 * dev_vel;
+glm::vec3 * dev_acc;
 
 void checkCUDAError(const char *msg, int line = -1)
 {
@@ -69,7 +72,8 @@ void generateRandomPosArray(int time, int N, glm::vec4 * arr, float scale, float
         glm::vec3 rand = scale*(generateRandomNumberFromThread(time, index)-0.5f);
         arr[index].x = rand.x;
         arr[index].y = rand.y;
-        arr[index].z = 0.0f;//rand.z;
+        //arr[index].z = 0.0f;//rand.z;
+        arr[index].z = rand.z;
         arr[index].w = mass;
     }
 }
@@ -85,7 +89,7 @@ void generateCircularVelArray(int time, int N, glm::vec3 * arr, glm::vec4 * pos)
         glm::vec3 R = glm::vec3(pos[index].x, pos[index].y, pos[index].z);
         float r = glm::length(R) + EPSILON;
         float s = sqrt(G*starMass/r);
-        glm::vec3 D = glm::normalize(glm::cross(R/r,glm::vec3(0,0,1)));
+        glm::vec3 D = 10.0f*glm::normalize(glm::cross(R/r,glm::vec3(0,0,1)));
         arr[index].x = s*D.x;
         arr[index].y = s*D.y;
         arr[index].z = s*D.z;
@@ -102,11 +106,12 @@ void generateRandomVelArray(int time, int N, glm::vec3 * arr, float scale)
         glm::vec3 rand = scale*(generateRandomNumberFromThread(time, index) - 0.5f);
         arr[index].x = rand.x;
         arr[index].y = rand.y;
-        arr[index].z = 0.0;//rand.z;
+	arr[index].z = 0.0;
+        //arr[index].z = rand.z;
     }
 }
 
-//TODO: Determine force between two bodies
+//DONE: Determine force between two bodies
 __device__
 glm::vec3 calculateAcceleration(glm::vec4 us, glm::vec4 them)
 {
@@ -117,38 +122,203 @@ glm::vec3 calculateAcceleration(glm::vec4 us, glm::vec4 them)
     //    G*m_us*m_them   G*m_them
     //a = ------------- = --------
     //      m_us*r^2        r^2
-    
-    return glm::vec3(0.0f);
+
+    float m_them = them.w; 
+   
+    glm::vec3 d = glm::vec3(us.x, us.y, us.z) - glm::vec3(them.x, them.y, them.z); 
+    float r2 = glm::dot( d, d );
+
+    // EPSILON softening-factor
+    float a = -G*m_them/(r2 + 1e-1);
+    return a*glm::normalize( d );
 }
 
-//TODO: Core force calc kernel global memory
+// Core force calc, global memory: star pull plus the pull of every other body, read straight from their_pos.
 __device__ 
 glm::vec3 naiveAcc(int N, glm::vec4 my_pos, glm::vec4 * their_pos)
 {
+    int index = threadIdx.x + (blockIdx.x * blockDim.x);
+    // The star is passed as a body of mass starMass placed at the origin
     glm::vec3 acc = calculateAcceleration(my_pos, glm::vec4(0,0,0,starMass));
+    // Entry i == index is skipped; this assumes my_pos is their_pos[index] (true for updateF)
+    for ( int i=0; i < N; ++i ) {
+      if ( i == index ) 
+	continue;
+      acc += calculateAcceleration(my_pos, their_pos[i]);
+    }
     return acc;
 }
 
 
-//TODO: Core force calc kernel shared memory
+// Core force calc, shared memory. Every thread of the block must call this: it uses __syncthreads().
 __device__ 
 glm::vec3 sharedMemAcc(int N, glm::vec4 my_pos, glm::vec4 * their_pos)
 {
+    __shared__ glm::vec4 shared_their_pos[blockSize];
     glm::vec3 acc = calculateAcceleration(my_pos, glm::vec4(0,0,0,starMass));
+
+    int index = threadIdx.x + (blockIdx.x * blockDim.x);
+    // Walk the bodies one tile of blockDim.x positions at a time
+    for ( int i = 0; i < N; i += blockDim.x ) { 
+      // Global index of the body this thread stages into shared memory
+      int gbl_index = i + threadIdx.x;
+      // The last tile is partial when N is not a multiple of blockDim.x;
+      // loading their_pos[gbl_index] past N would read out of bounds.
+      if ( gbl_index < N )
+        shared_their_pos[threadIdx.x] = their_pos[gbl_index];
+      // Don't forget to sync after the copy 
+      __syncthreads();
+
+      // Only the first tile_size slots hold bodies that belong to this tile
+      int tile_size = ( N - i < (int)blockDim.x ) ? N - i : (int)blockDim.x;
+      for ( int j=0; j < tile_size; j++ ) {
+	  if ( i+j != index ) 
+	    acc += calculateAcceleration(my_pos, shared_their_pos[j]);
+      }
+      // Sync before next copy
+      __syncthreads();
+    }
     return acc;
+    //return calculateAcceleration(my_pos, glm::vec4(0,0,0,starMass));
 }
 
+// Alignment rule: mean velocity of the other particles whose squared distance
+// to my_pos is below 5.0; returns zero when no particle is in range.
+__device__
+glm::vec3 Alignment( int N, glm::vec4 my_pos, glm::vec4* pos, glm::vec3* vel )
+{
+  // Zero-initialise explicitly: the sum and the counter are accumulated below.
+  glm::vec3 ave_vel( 0.0f );
+  int cnt = 0;
+  float r2;
+  glm::vec3 d;
+  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+  for ( int i=0; i < N; ++i ) {
+    if ( i == index ) 
+      continue;
+    d = glm::vec3( pos[i].x-my_pos.x, pos[i].y-my_pos.y, pos[i].z-my_pos.z); 
+    r2 = glm::dot( d, d );
+    if ( r2 < 5.0 ) {
+      ave_vel += glm::vec3( vel[i].x, vel[i].y, vel[i].z );
+      cnt ++;
+    }
+  }
+  // No neighbours: avoid the 0/0 division, which would poison vel with NaNs.
+  if ( cnt == 0 )
+    return glm::vec3( 0.0f );
+  return ave_vel/float(cnt);
+} 
+
+// Cohesion rule: unit vector from my_pos towards the centroid of the other
+// particles whose squared distance is below 5.0; zero when there is none.
+__device__
+glm::vec3 Cohesion( int N, glm::vec4 my_pos, glm::vec4* pos)
+{
+  glm::vec3 ave_pos( 0.0f );
+  float r2;
+  glm::vec3 d;
+  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+  int cnt = 0;
+  // Sum the positions of the neighbours (plain, unweighted sum)
+  for ( int i=0; i < N; ++i ) {
+    if ( i == index ) 
+      continue;
+    d = glm::vec3( pos[i].x-my_pos.x, pos[i].y-my_pos.y, pos[i].z-my_pos.z); 
+    r2 = glm::dot( d, d );
+    if ( r2 < 5.0 ) {
+      cnt++;
+      ave_pos += glm::vec3( pos[i].x, pos[i].y, pos[i].z );
+    }
+  }
+  // No neighbours: there is no centroid to steer towards, and 0/0 is NaN.
+  if ( cnt == 0 )
+    return glm::vec3( 0.0f );
+  ave_pos = ave_pos/float(cnt);
+
+  d = glm::vec3(ave_pos.x-my_pos.x, ave_pos.y-my_pos.y, ave_pos.z-my_pos.z); 
+  // glm::normalize would divide by a zero length when this particle already
+  // sits on the centroid, so that case returns zero instead.
+  r2 = glm::dot( d, d );
+  return ( r2 > 0.0f ) ? d / sqrtf( r2 ) : glm::vec3( 0.0f );
+} 
+
+// Separation rule: sum of unit vectors pointing away from every other particle
+// whose squared distance to my_pos is below 1.0.
+__device__
+glm::vec3 Seperation( int N, glm::vec4 my_pos, glm::vec4* pos  )
+{
+  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+
+  glm::vec3 acc( 0.0f );
+  glm::vec3 d;
+  float r2;
+  for ( int i=0; i < N; ++i ) {
+    if ( i == index ) 
+      continue;
+    d = glm::vec3( pos[i].x-my_pos.x, pos[i].y-my_pos.y, pos[i].z-my_pos.z); 
+    r2 = glm::dot( d, d );
+    // r2 == 0 (coincident particles) has no direction; normalize() would give NaN
+    if ( r2 > 0.0f && r2 < 1.0 ) 
+      acc += -glm::normalize(d);
+  }
+  return acc;
+}
+
+// Force pass: stores the acceleration of body `index` in acc[index] (dt and vel are not used here).
+__global__
+void updateF(int N, float dt, glm::vec4 * pos, glm::vec3 * vel, glm::vec3 * acc)
+{
+    int index = threadIdx.x + (blockIdx.x * blockDim.x);
+    // Threads past N keep this defined dummy position instead of garbage.
+    glm::vec4 my_pos( 0.0f );
+    if(index < N) my_pos = pos[index];
+
+    // Not guarded by index < N: with SHARED == 1, ACC calls __syncthreads().
+    glm::vec3 accel = ACC(N, my_pos, pos);
+
+    if(index < N) acc[index] = accel;
+}
 
 //Simple Euler integration scheme
 __global__
-void update(int N, float dt, glm::vec4 * pos, glm::vec3 * vel)
+void updateS(int N, float dt, glm::vec4 * pos, glm::vec3 * vel, glm::vec3 * acc, glm::vec3 star_position )
 {
     int index = threadIdx.x + (blockIdx.x * blockDim.x);
     if( index < N )
     {
         glm::vec4 my_pos = pos[index];
-        glm::vec3 acc = ACC(N, my_pos, pos);
-        vel[index] += acc * dt;
+
+	#if FLOCKING == 0 
+        //glm::vec3 acc = ACC(N, my_pos, pos);
+        vel[index] += acc[index] * dt;
+	#else 
+	// Align velocity with flock average
+	glm::vec3 align_vel = Alignment( N, my_pos, pos, vel );
+	// Attract towards flock average
+	glm::vec3 cohesion_vel = Cohesion( N, my_pos, pos );
+	// Repel from nearby objects
+	glm::vec3 seperation_vel = Seperation( N, my_pos, pos );
+
+	glm::vec3 weights = glm::vec3( 0.8, 0.2, 0.5 );	
+	// Need some weights
+	vel[index] = weights.x*align_vel + weights.y*cohesion_vel + weights.z*seperation_vel;
+
+	// Add in circular velocity around star
+	//glm::vec3 star_position( 1.0, 0.0, 0.0 );
+        glm::vec3 R = glm::vec3(pos[index].x-star_position.x, pos[index].y-star_position.y, pos[index].z-star_position.z);
+        float r = glm::length(R) + EPSILON;
+        float s = sqrt(G*starMass/r);
+        glm::vec3 D = glm::normalize(glm::cross(R/r,glm::vec3(0,0,1)));
+	vel[index] += 1.0f*D;
+
+	// Add in attractive velocity toward star
+	vel[index] += -0.01f*R;
+
+	// Add in damping
+	//vel[index] *= 0.7f;
+
+	#endif 
+
         pos[index].x += vel[index].x * dt;
         pos[index].y += vel[index].y * dt;
         pos[index].z += vel[index].z * dt;
@@ -191,7 +361,8 @@ void sendToPBO(int N, glm::vec4 * pos, float4 * pbo, int width, int height, floa
     if(x<width && y<height)
     {
         glm::vec3 color(0.05, 0.15, 0.3);
-        glm::vec3 acc = ACC(N, glm::vec4((x-w2)/c_scale_w,(y-h2)/c_scale_h,0,1), pos);
+	glm::vec3 acc = glm::vec3( 0.0, 0.0, 0.5 );
+        //glm::vec3 acc = ACC(N, glm::vec4((x-w2)/c_scale_w,(y-h2)/c_scale_h,0,1), pos);
         float mag = sqrt(sqrt(acc.x*acc.x + acc.y*acc.y + acc.z*acc.z));
         // Each thread writes one pixel location in the texture (textel)
         pbo[index].w = (mag < 1.0f) ? mag : 1.0f;
@@ -208,10 +379,16 @@ void initCuda(int N)
     numObjects = N;
     dim3 fullBlocksPerGrid((int)ceil(float(N)/float(blockSize)));
 
+    printf("Num Objects: %d \n", numObjects );
+    printf("Num blocks: %d, BlockSize, %d \n", fullBlocksPerGrid.x, blockSize );
+    printf("Shared: %d \n", SHARED );
+
     cudaMalloc((void**)&dev_pos, N*sizeof(glm::vec4));
     checkCUDAErrorWithLine("Kernel failed!");
     cudaMalloc((void**)&dev_vel, N*sizeof(glm::vec3));
     checkCUDAErrorWithLine("Kernel failed!");
+    cudaMalloc((void**)&dev_acc, N*sizeof(glm::vec3));
+    checkCUDAErrorWithLine("Kernel failed!");
 
     generateRandomPosArray<<<fullBlocksPerGrid, blockSize>>>(1, numObjects, dev_pos, scene_scale, planetMass);
     checkCUDAErrorWithLine("Kernel failed!");
@@ -219,10 +396,11 @@ void initCuda(int N)
     checkCUDAErrorWithLine("Kernel failed!");
 }
 
-void cudaNBodyUpdateWrapper(float dt)
+void cudaNBodyUpdateWrapper(float dt, glm::vec3 goal_position )
 {
     dim3 fullBlocksPerGrid((int)ceil(float(numObjects)/float(blockSize)));
-    update<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel);
+    updateF<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel, dev_acc); // pass 1: fill dev_acc from the current positions
+    updateS<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel, dev_acc, goal_position); // pass 2: advance dev_vel and dev_pos
     checkCUDAErrorWithLine("Kernel failed!");
 }
 
diff --git a/Part1/src/kernel.h b/Part1/src/kernel.h
index 1f8b37a..44caeba 100644
--- a/Part1/src/kernel.h
+++ b/Part1/src/kernel.h
@@ -5,6 +5,7 @@
 #include <thrust/random.h>
 #include <cuda.h>
 #include <cmath>
+#include "glm/glm.hpp"
 
 #if CUDA_VERSION >= 5000
     #include <helper_math.h>
@@ -12,12 +13,12 @@
     #include <cutil_math.h>
 #endif
 
-#define blockSize 128
+#define blockSize 256
 #define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__)
-#define SHARED 0
+#define SHARED 1
 
 void checkCUDAError(const char *msg, int line);
-void cudaNBodyUpdateWrapper(float dt);
+void cudaNBodyUpdateWrapper(float dt, glm::vec3 goal_position );
 void initCuda(int N);
 void cudaUpdatePBO(float4 * pbodptr, int width, int height);
 void cudaUpdateVBO(float * vbodptr, int width, int height);
diff --git a/Part1/src/main.cpp b/Part1/src/main.cpp
index d4c9c5b..df38cf1 100644
--- a/Part1/src/main.cpp
+++ b/Part1/src/main.cpp
@@ -4,13 +4,14 @@
 
 #include "main.h"
 
-#define N_FOR_VIS 25
-#define DT 0.2
+#define N_FOR_VIS (8*1024) // parenthesized so the macro expands safely inside larger expressions
+#define DT 0.05
 #define VISUALIZE 1
 //-------------------------------
 //-------------MAIN--------------
 //-------------------------------
 
+
 int main(int argc, char** argv)
 {
     // Launch CUDA/GL
@@ -43,7 +44,7 @@ int main(int argc, char** argv)
 
     glutDisplayFunc(display);
     glutKeyboardFunc(keyboard);
-
+  
     glutMainLoop();
 
     return 0;
@@ -64,7 +65,7 @@ void runCuda()
     cudaGLMapBufferObject((void**)&dptrvert, planetVBO);
 
     // execute the kernel
-    cudaNBodyUpdateWrapper(DT);
+    cudaNBodyUpdateWrapper(DT, goal_position);
 #if VISUALIZE == 1
     cudaUpdatePBO(dptr, field_width, field_height);
     cudaUpdateVBO(dptrvert, field_width, field_height);
@@ -88,6 +89,7 @@ void display()
         timebase = time;
         frame = 0;
     }
+
     runCuda();
 
     char title[100];
@@ -152,6 +154,19 @@ void keyboard(unsigned char key, int x, int y)
         case(27):
             exit(1);
             break;
+
+	case('w'):
+	    goal_position.y += 0.1;
+	    break;
+	case('s'):
+	    goal_position.y -= 0.1;
+	    break;
+	case('a'):
+	    goal_position.x -= 0.1;
+	    break;
+	case('d'):
+	    goal_position.x += 0.1;
+	    break;
     }
 }
 
@@ -353,5 +368,7 @@ void deleteTexture(GLuint* tex)
 
 void shut_down(int return_code)
 {
-    exit(return_code);
+    printf( "Calling cudaDeviceReset \n" );
+    cudaDeviceReset();
+    //exit(return_code);
 }
diff --git a/Part1/src/main.h b/Part1/src/main.h
index 2b818bf..b0296c5 100644
--- a/Part1/src/main.h
+++ b/Part1/src/main.h
@@ -58,6 +58,9 @@ float zFar = 5.0;
 glm::mat4 projection;
 glm::mat4 view;
 glm::vec3 cameraPosition(1.75,1.75,1.35);
+
+glm::vec3 goal_position( 0.0, 0.0, 0.0 );
+
 //-------------------------------
 //----------CUDA STUFF-----------
 //-------------------------------
@@ -74,7 +77,7 @@ int main(int argc, char** argv);
 //---------RUNTIME STUFF---------
 //-------------------------------
 
-void runCuda();
+void runCuda( glm::vec3 goal_position );
 
 void display();
 void keyboard(unsigned char key, int x, int y);
diff --git a/README.md b/README.md
index e3122aa..1c1744f 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,164 @@
+![planets](Part1/resources/planets.png)
+
+---
 CIS565: Project 3: CUDA Simulation and GLSL Visualization
+---
+
+After completing the initial simulation requirements, I chose to implement flocking.
+Each particle has three behaviors:
+- Alignment - the particle's velocity converges to the average velocity of its neighbors.
+- Cohesion - the particle is attracted to the center of the flock.
+- Separation - particles are repelled by other nearby particles so they don't collide.
+
+I chose to have a radius of interaction for each behavior which allows for various
+overall flock behaviors. For instance a small cohesion radius generally leads to 
+particles bunching into multiple smaller flocks while a large radius will lead the
+particles to all converge to the global flock center. 
+
+To keep the particles from leaving the viewing region, I added a control point that
+each particle is drawn towards. To make things interesting, I also added a rotational
+velocity component to this behavior. The control point can be moved around interactively
+using the 'w', 'a', 's' and 'd' keys.
+
+Showing the particles flocking. 
+![flocking](Part1/resources/particles_flocking.png)
+
+Showing particles in a rest state ( no rotational velocity component added ). 
+![flocking](Part1/resources/particle_flock_no_spin.png)
+
+---
+Video
+---
+I posted a video of the flocking on YouTube. It gets a little choppy because of my
+screen-capture software (I'm working on putting up a better one), but it gets the point across.
+
+http://youtu.be/GNqQbQlhye4
+
+---
+Performance: Shared memory 
+---
+For the N-body force calculation it's recommended to load particle states into
+shared memory for each block. I varied the block size for both the shared and the
+non-shared memory versions and saw that performance went up somewhat from 64 to 512.
+Then, fixing the block size at 512 (maximum performance), I evaluated the number of
+particles versus framerate and, surprisingly, didn't find a large performance
+difference between my shared memory implementation and the naive implementation
+(I'm still investigating this).
+
+Note: I removed the ACC calculation for the grid for the test, allowing me
+to observe the absolute number of particles that can be handled. 
+
+particles | framerate
+
+BlockSize: 64
+
+no shared memory:
+
+- 1024 | 38.46
+- 2048 | 33.53
+- 4096 | 20.37
+- 8192 | 7.98
+- 16384 | 2.30
+- 24576 | 1.06
+
+shared memory:
+
+- 1024 | 39.68
+- 2048 | 33.80
+- 4096 | 21.32
+- 8192 | 8.64
+- 16384 | 2.57
+- 24576 | 1.19
+
+BlockSize: 256
+
+shared memory:
+
+- 1024 | 39.96
+- 2048 | 33.90
+- 4096 | 21.05
+- 8192 | 8.56
+- 16384 | 2.53
+- 24576 | 1.17
+
+no shared memory:
+- 1024 | 39.96
+- 2048 | 34.25
+- 4096 | 21.26
+- 8192 | 9.06
+- 16384 | 2.72
+- 24576 | 1.26
+
+BlockSize: 512
+
+no shared memory:
+
+- 64 | 38.31 
+- 128 | 37.11
+- 256 | 37.00
+- 512 | 37.55
+- 1024 | 37.51
+- 2048 | 33.40
+- 4096 | 21.19
+- 8192 | 8.23
+- 16384 | 2.50
+- 24576 | 1.16
+
+shared memory:
+
+- 1024 | 38.77
+- 2048 | 32.80
+- 4096 | 21.48
+- 8192 | 8.90
+- 16384 | 2.67
+- 24576 | 1.124
+
+BlockSize: 1024
+
+no shared memory:
+- 1024 | 37.51
+- 2048 | 33.40
+- 4096 | 21.19
+- 8192 | 8.23
+- 16384 | 2.50
+- 24576 | 1.16
+
+shared memory:
+- 1024 | 38.58
+- 2048 | 33.30
+- 4096 | 21.13
+- 8192 | 8.73
+- 16384 | 2.58
+- 24576 | 1.19
+
+
+Generally, block size does not have a large effect on framerate.
+There appears to be an improvement of a few percent when using shared memory
+over the naive implementation. The following are the specs for my GPU,
+if anyone is interested.
+
+```
+GPU Info:
+GeForce 610M
+Total amount of global memory:                 1024 MBytes (1073414144 bytes)
+( 1) Multiprocessors, ( 48) CUDA Cores/MP:     48 CUDA Cores
+GPU Clock rate:                                1344 MHz (1.34 GHz)
+Memory Clock rate:                             800 Mhz
+Memory Bus Width:                              64-bit
+Maximum Texture Dimension Size (x,y,z)         1D=(65536), 2D=(65536, 65535), 3D=(2048, 2048, 2048)
+Maximum Layered 1D Texture Size, (num) layers  1D=(16384), 2048 layers
+Maximum Layered 2D Texture Size, (num) layers  2D=(16384, 16384), 2048 layers
+Total amount of constant memory:               65536 bytes
+Total amount of shared memory per block:       49152 bytes
+Total number of registers available per block: 32768
+Warp size:                                     32
+Maximum number of threads per multiprocessor:  1536
+Maximum number of threads per block:           1024
+Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
+Max dimension size of a grid size    (x,y,z): (65535, 65535, 65535)
+Maximum memory pitch:                          2147483647 bytes
+```
+
 ===
 Fall 2013
 ---
@@ -15,6 +175,9 @@ Lab or in Moore100 labs. All machines in the SIG Lab and Moore100 are equipped
 with CUDA capable NVIDIA graphics cards. If this too proves to be a problem,
 please contact Patrick or Liam as soon as possible.
 
+
+
+
 ---
 INTRODUCTION:
 ---