Changes from all commits

42 commits
5cd1644
gk fork, adding preprocessing for own data, predict script
wasd12345 Jun 26, 2018
aeabb2a
simple imputation
wasd12345 Jun 26, 2018
21731ed
making features from our data
wasd12345 Jun 27, 2018
6b48e09
sampling_period daily weekly etc
wasd12345 Jun 28, 2018
2a67dd1
py36 fixes
wasd12345 Jun 28, 2018
a681e1e
incorporating my modified features
wasd12345 Jun 29, 2018
0e7b628
continue putting in my features
wasd12345 Jul 2, 2018
ad81613
working with a few of our example features
wasd12345 Jul 2, 2018
858ff2a
working w our data and features
wasd12345 Jul 3, 2018
983fb9c
finished weekly aggregation
wasd12345 Jul 5, 2018
1472457
removed dummy tensors, now only return exact needed
wasd12345 Jul 8, 2018
a395049
input pipe cleanup
wasd12345 Jul 9, 2018
a9066ae
input pipe features, arturius and full features all samplingperiods w…
wasd12345 Jul 9, 2018
06eacda
predict script add arguments needed; trainer minor updates
wasd12345 Jul 9, 2018
c1c6f37
--
wasd12345 Jul 10, 2018
c68ad1e
debugging, cleanup, adding other optimizers
wasd12345 Jul 12, 2018
e33979e
doing median of last 4 weeks imputation for daily sampled data
wasd12345 Jul 16, 2018
e555ee1
changed preprocessing to NANs instead of -1 which fixed the SMAPE 2 i…
wasd12345 Jul 16, 2018
ea234ba
future predictions on our data working
wasd12345 Jul 16, 2018
3b53e6a
multirun train-val bash script
wasd12345 Jul 19, 2018
9613294
added (year - 2010) / (2020 - 2010) as feature
wasd12345 Jul 19, 2018
a6ededd
added encoder-decoder context to every decoder timestep
wasd12345 Jul 19, 2018
d5277fd
misc. and starting on random series sizes
wasd12345 Jul 23, 2018
0a67384
starting on holidays
wasd12345 Jul 24, 2018
984a848
finished basic holiday encoding except Thanksgiving, Easter
wasd12345 Jul 24, 2018
1a23d99
finished K-step lookback - moderate SMAPE improvement
wasd12345 Jul 25, 2018
046ba10
starting on SMAPE heatmaps
wasd12345 Jul 26, 2018
d3d6757
completely different train test split data
wasd12345 Jul 27, 2018
54fab19
doing all (t,v,b) predictions
wasd12345 Jul 28, 2018
dd37b80
working predictions by t,v, backoffset
wasd12345 Jul 31, 2018
d63c3e4
saving out smapes, bias
wasd12345 Jul 31, 2018
697e98f
performance heatmaps done for one test/validation time
wasd12345 Aug 1, 2018
f476338
fixed kaggle's NAN issue from train_complete_threshold rounding
wasd12345 Aug 2, 2018
f387474
all working performance heatmaps
wasd12345 Aug 2, 2018
3e12a05
fixed problem of Kaggle features leaking future information backward
wasd12345 Aug 7, 2018
d62b9de
4 chunk backtesting
wasd12345 Aug 10, 2018
e3d6ea2
working MLP postprocessor, helps SMAPE by >4pct
wasd12345 Aug 13, 2018
6cb6854
fixed heatmap issue for backtest mode for history+horizon size
wasd12345 Aug 13, 2018
352bdb7
SMAPE + K*quantile_loss
wasd12345 Aug 15, 2018
fee641d
when doing quantiles, only use average pinball losses, ignore SMAPE
wasd12345 Aug 15, 2018
df1b288
working direct quantile forecaster
wasd12345 Aug 17, 2018
ccc7710
multistep encoder predicts differently
wasd12345 Aug 23, 2018
8 changes: 8 additions & 0 deletions .gitignore
@@ -11,3 +11,11 @@ data/*.zip
data/submission.csv.gz
!data/2017-08-15_2017-09-11.csv.zip

data/*
*/.DS_STORE
.DS_STORE
images/
ex_figs/

*.png
output/
92 changes: 92 additions & 0 deletions Adam_HD_optimizer.py
@@ -0,0 +1,92 @@
# Copy-pasted from https://github.com/zadaianchuk/HyperGradientDescent/blob/master/Adam_HD_optimizer.py
# Hypergradient Descent optimizer (Adam variant)

from __future__ import division

import tensorflow as tf


class AdamHDOptimizer(tf.train.GradientDescentOptimizer):

    def __init__(self, alpha_0, beta=10**(-7), name="HGD", mu=0.99, eps=10**(-8), type_of_learning_rate="global"):
        super(AdamHDOptimizer, self).__init__(beta, name=name)

        self._mu = mu
        self._alpha_0 = alpha_0
        self._beta = beta
        self._eps = eps
        self._type = type_of_learning_rate

    def minimize(self, loss, global_step):

        # Algorithm parameters as constant tensors
        mu = tf.convert_to_tensor(self._mu, dtype=tf.float32)
        alpha_0 = tf.convert_to_tensor(self._alpha_0, dtype=tf.float32)
        beta = tf.convert_to_tensor(self._beta, dtype=tf.float32)
        eps = tf.convert_to_tensor(self._eps, dtype=tf.float32)

        var_list = tf.trainable_variables()

        # Create and retrieve slot variables for:
        # direction of the previous step
        ds = [self._get_or_make_slot(var,
                                     tf.constant(0.0, tf.float32, var.get_shape()), "direction", "direction")
              for var in var_list]
        # current learning rate alpha (one global scalar, or one per variable)
        if self._type == "global":
            alpha = self._get_or_make_slot(alpha_0, alpha_0, "learning_rate", "learning_rate")
        else:
            alphas = [self._get_or_make_slot(var,
                                             tf.constant(self._alpha_0, tf.float32, var.get_shape()), "learning_rates", "learning_rates")
                      for var in var_list]
        # moving-average estimates of the first and second moments
        ms = [self._get_or_make_slot(var,
                                     tf.constant(0.0, tf.float32, var.get_shape()), "m", "m")
              for var in var_list]
        vs = [self._get_or_make_slot(var,
                                     tf.constant(0.0, tf.float32, var.get_shape()), "v", "v")
              for var in var_list]
        # power of mu for the bias-corrected first and second moment estimates
        mu_power = tf.get_variable("mu_power", shape=(), dtype=tf.float32, trainable=False,
                                   initializer=tf.constant_initializer(1.0))

        # Update moving averages of the first and second moments
        grads = tf.gradients(loss, var_list)
        grads_squared = [tf.square(g) for g in grads]
        m_updates = [m.assign(mu*m + (1.0-mu)*g) for m, g in zip(ms, grads)]  # new means
        v_updates = [v.assign(mu*v + (1.0-mu)*g2) for v, g2 in zip(vs, grads_squared)]
        mu_power_update = [tf.assign(mu_power, tf.multiply(mu_power, mu))]
        # Bias correction of the estimates
        with tf.control_dependencies(v_updates + m_updates + mu_power_update):
            ms_hat = [tf.divide(m, tf.constant(1.0) - mu_power) for m in ms]
            vs_hat = [tf.divide(v, tf.constant(1.0) - mu_power) for v in vs]

        # Update of the learning rate alpha: the main difference between Adam and Adam-HD
        if self._type == "global":
            hypergrad = sum([tf.reduce_sum(tf.multiply(d, g)) for d, g in zip(ds, grads)])
            alphas_update = [alpha.assign(alpha - beta*hypergrad)]
        else:
            hypergrads = [tf.multiply(d, g) for d, g in zip(ds, grads)]
            alphas_update = [alpha.assign(alpha - beta*hypergrad) for alpha, hypergrad in zip(alphas, hypergrads)]

        # Update step directions
        # (control dependency ensures alphas are computed from the previous step directions)
        with tf.control_dependencies(alphas_update):
            ds_updates = [d.assign(-tf.divide(m, tf.sqrt(v) + self._eps)) for (m, v, d) in zip(ms_hat, vs_hat, ds)]

        # Update the parameters of the model
        with tf.control_dependencies(ds_updates):
            if self._type == "global":
                dirs = [alpha*d for d in ds]
                alpha_norm = alpha
            else:
                dirs = [alpha*d for d, alpha in zip(ds, alphas)]
                alpha_norm = sum([tf.reduce_mean(alpha**2) for alpha in alphas])
            variable_updates = [v.assign_add(d) for v, d in zip(var_list, dirs)]
            # include the global_step increment in the returned group so it actually runs
            increment_step = global_step.assign_add(1)
            # add summaries (track changes in alpha)
            with tf.name_scope("summaries"):
                with tf.name_scope("per_iteration"):
                    alpha_norm_sum = tf.summary.scalar("alpha", alpha_norm,
                                                       collections=[tf.GraphKeys.SUMMARIES, "per_iteration"])
        return tf.group(*(variable_updates + [increment_step]))
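
For orientation: the alpha update above is the hypergradient-descent step, where alpha is nudged by beta times the dot product of the current gradient with the previous step direction. Below is a minimal usage sketch of the class above; the toy loss and step count are made up for illustration, and a TF1-style graph and session are assumed, as elsewhere in this repo.

import tensorflow as tf

# Toy problem: minimize (w - 3)^2 with the optimizer defined above.
w = tf.get_variable("w", shape=(), initializer=tf.zeros_initializer())
loss_op = tf.square(w - 3.0)

global_step = tf.train.get_or_create_global_step()
opt = AdamHDOptimizer(alpha_0=1e-2, beta=1e-7, mu=0.99)
train_op = opt.minimize(loss_op, global_step)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(500):
        sess.run(train_op)
    print(sess.run(w))  # should approach 3.0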
61 changes: 61 additions & 0 deletions MAKEFEATURES_TRAIN_ALL.sh
@@ -0,0 +1,61 @@
# When doing the chunked backtest approach, we need to train/retrain the model
# after each new chunk of training data comes in.

# For this setup, just retrain from scratch each time (i.e. start over
# completely rather than resuming from the last checkpoint of the previous
# training chunk).


# ==============================================================================
# PARAMETERS
# ==============================================================================
# For each of the N training sets: train a model.
# true/false: whether to remake the feature sets vs. skipping directly to training
MAKE_FEATURESETS=false
# Make some cached features for all the training/test sets
makefeats_names="TRAINset1 TRAINset2 TRAINset3 TRAINset4 TESTset1 TESTset2 TESTset3 TESTset4"
train_names="TRAINset1 TRAINset2 TRAINset3 TRAINset4"
# In training, the max number of epochs to do. By 25-50, things have usually plateaued.
MAX_EPOCH=50


if $MAKE_FEATURESETS; then

    echo 'Cleaning up, then remaking feature sets'
    # Clean up between feature sets
    cd data
    rm -R TRAIN*
    rm -R TEST*
    rm -R cpt/
    rm -R cpt_tmp/
    rm -R logs/
    rm *.pkl
    cd ..
    ls -l data/


    # =============================================================================
    # make_features.py
    # =============================================================================
    for v in $makefeats_names; do
        # Create the features for our data
        echo 'running make_features.py'
        echo $v
        python3 make_features.py data/$v ours daily full --add_days=0
    done
fi


# =============================================================================
# trainer.py
# =============================================================================
for v in $train_names; do
    echo 'running trainer.py'
    echo $v
    # By default, trainer.py already does a forward split.
    python3 trainer.py full daily --name=$v --hparam_set='encdec' --n_models=3 --asgd_decay=0.99 --max_steps=11500 --save_from_step=10 --max_epoch=$MAX_EPOCH --patience=5 --verbose --save_epochs_performance
    # --side_split  # The side_split option gives unrealistic SMAPE values:
    # it reports training, side-split, and forward-step SMAPEs of only 3-8%,
    # which is clearly unrealistic. Unclear whether the original Kaggle author
    # computed things differently for the side-eval option; leave it off for
    # now and only do forward eval.
done
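
As a rough illustration of the chunked walk-forward setup this script retrains over, here is a hypothetical sketch: the start date, 90-day chunk length, and the roughly 3-month spacing are assumptions for illustration only; the real TRAINset/TESTset splits come from this repo's preprocessing, not this code.

import pandas as pd

# Hypothetical walk-forward chunking behind the TRAINset1..4 / TESTset1..4
# naming; dates and chunk length are made up for illustration.
start = pd.Timestamp('2017-01-01')
chunk = pd.Timedelta(days=90)  # e.g. ~3 months of new data per retrain

for i in range(1, 5):
    train_end = start + i * chunk   # training window grows with each chunk
    test_end = train_end + chunk    # test on the next, unseen chunk
    print(f'TRAINset{i}: {start.date()} .. {train_end.date()}   '
          f'TESTset{i}: {train_end.date()} .. {test_end.date()}')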
227 changes: 227 additions & 0 deletions PERFORMANCE_HEATMAPS.py
@@ -0,0 +1,227 @@
# Analyze the different performance metrics
# Make the performance heatmaps

# There will be 4 different TRAIN-TEST sets; each has a model trained on that
# train set and tested on that test set. To simulate a production environment
# where we would retrain the model every so often, we thus have e.g. 4 tests
# of the model, each with, say, 3 months more data appended. So we just do 4
# separate analyses.

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

import os
import numpy as np
import pickle
from collections import defaultdict


# =============================================================================
# PARAMETERS
# =============================================================================
OUTDIR = 'output'
NAMES = ['TESTset1', 'TESTset2', 'TESTset3', 'TESTset4']


# =============================================================================
# MAIN
# =============================================================================

def load_dict(path):
    with open(path, 'rb') as gg:
        d = pickle.load(gg)
    return d


def aggregate__overall(data_dict, real_only, id_subsets, bad_ids):
    """
    For each (history, horizon) pair, marginalize over all id's and dates.

    Format is (history, horizon, backoffset, id) :
        {'SMAPE': smape, 'bias': bi}  # optionally also 'MAE', 'predict_start_date', 'predict_end_date'
    """
    agg_dict = defaultdict(lambda: [])
    for k, v in data_dict.items():
        series_id = k[3]
        # Only use the real series, ignore the synthetic ones
        # (synthetic series have names like {id}__... )
        if real_only:
            if '__' in series_id:
                continue
        # If we have a set of holdout id's:
        if id_subsets:
            if series_id not in id_subsets:
                continue
        # Regardless of mode, if this is one of the corrupted time series, ignore it:
        if series_id in bad_ids:
            continue

        history = k[0]
        horizon = k[1]
        smape = v['SMAPE']
        agg_dict[(history, horizon)] += [smape]

    # Now get summary statistics of the SMAPEs
    metrics_dict = {}
    for k, v in agg_dict.items():
        mean = np.nanmean(v)
        median = np.nanmedian(v)
        sd = np.nanstd(v)
        pctl_5 = np.percentile([i for i in v if np.isfinite(i)], 5)
        pctl_95 = np.percentile([i for i in v if np.isfinite(i)], 95)
        metrics_dict[k] = {'mean': mean, 'median': median, 'sd': sd, '5pctl': pctl_5, '95pctl': pctl_95}

    histories = list(np.unique([i[0] for i in metrics_dict.keys()]))
    horizons = list(np.unique([i[1] for i in metrics_dict.keys()]))

    metrics_arrays = {}
    for metric in ['mean', 'median', 'sd', '5pctl', '95pctl']:
        _array = np.nan * np.ones((len(histories), len(horizons)))
        for k, v in metrics_dict.items():
            i = histories.index(k[0])
            j = horizons.index(k[1])
            _array[i, j] = v[metric]
        metrics_arrays[metric] = _array
    print(metrics_arrays)
    return metrics_dict, histories, horizons, metrics_arrays


def make_heatmap(metrics_arrays, histories, horizons, outdir, name):
    """
    Visualize the SMAPE values.
    """
    # For scale, cap the heatmap at 200 (the worst possible SMAPE); otherwise,
    # to improve dynamic range, use the highest measured SMAPE value.
    for k, v in metrics_arrays.items():

        savename = k + '_' + name

        vmax = np.nanmin([200., np.nanmax(np.ceil(v))])

        plt.figure()
        plt.imshow(v, vmin=0., vmax=vmax)
        plt.title(savename, fontsize=15)
        plt.colorbar()
        plt.xlabel('Horizon', fontsize=15)
        plt.ylabel('History', fontsize=15)
        plt.xticks(np.arange(len(horizons)), horizons, fontsize=15)
        plt.yticks(np.arange(len(histories)), histories, fontsize=15)

        # Annotate each cell with its rounded value
        for x in range(len(horizons)):
            for y in range(len(histories)):
                s = np.round(v[y, x], 1)
                plt.text(x - .25, y, s)
        savepath = os.path.join(outdir, f'{savename}.png')
        plt.savefig(savepath)


if __name__ == '__main__':
    # parser = argparse.ArgumentParser()
    # parser.add_argument('--logdir', default='data/logs', help="Directory where numpy arrays of performance are")
    # parser.add_argument('--K_last', default=3, dest='K_last', help='Save out per-EPOCH metrics (NOT per step, only per EPOCH)')
    # args = parser.parse_args()
    # param_dict = dict(vars(args))
    # make_heatmaps(**param_dict)

    # For each of the 4 dicts:

    # Make a list of id's that were held out from training, to assess transfer ability
    HOLD_OUTS = [str(i) for i in range(500)]  # Not actually held out; just to get an idea of performance on earlier ids
    special_ids = [str(i) for i in [531, 1007, 143, 130, 197, 203, 209, 215, 342, 476, 328, 182, 200, 145, 242, 44, 94, 147, 1, 5, 6, 7, 8, 12, 387, 429, 1005, 943]]
    id_dict = {'allIDs': [],
               'special_ids': special_ids,
               'holdout_ids': HOLD_OUTS}

    # Some of the id's are just bad (multiple month-long gaps from corrupted data, etc.), so they could be ignored.
    # For now, use everything, to get a conservative estimate of performance.
    BAD_IDs = []  # ['44','46','581','582','583','584']


    # =============================================================================
    # Aggregated over all 4 test sets
    # =============================================================================
    all_data = {}
    for chunkname in NAMES:
        print('chunkname: ', chunkname)
        path = os.path.join(OUTDIR, f'hist_horiz__{chunkname}.pickle')
        data = load_dict(path)
        new_data = {k + (chunkname,): v for k, v in data.items()}
        all_data.update(new_data)

    for real_only in [True, False]:
        for k, id_subsets in id_dict.items():

            r = 'real' if real_only else 'realAndsynthetic'
            name = '4Ave' + '_' + r + '_' + k
            print(name)

            metrics_dict, histories, horizons, metrics_arrays = aggregate__overall(all_data, real_only, id_subsets, BAD_IDs)
            make_heatmap(metrics_arrays, histories, horizons, OUTDIR, name)

            # Save out the metrics dict
            dict_savename = os.path.join(OUTDIR, f"hist_horiz__{name}__allchunks__metrics.pickle")
            with open(dict_savename, "wb") as outp:
                pickle.dump(metrics_dict, outp)


    # =============================================================================
    # Individual test sets
    # =============================================================================
    # For the 4-chunk backtesting performance assessment
    for chunkname in NAMES:
        print('chunkname: ', chunkname)
        path = os.path.join(OUTDIR, f'hist_horiz__{chunkname}.pickle')
        data = load_dict(path)

        for real_only in [True, False]:
            for k, id_subsets in id_dict.items():

                r = 'real' if real_only else 'realAndsynthetic'
                name = chunkname + '_' + r + '_' + k
                print(name)

                metrics_dict, histories, horizons, metrics_arrays = aggregate__overall(data, real_only, id_subsets, BAD_IDs)
                make_heatmap(metrics_arrays, histories, horizons, OUTDIR, name)

                # Save out the metrics dict
                dict_savename = os.path.join(OUTDIR, f"hist_horiz__{name}__metrics.pickle")
                with open(dict_savename, "wb") as outp:
                    pickle.dump(metrics_dict, outp)
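
For reference, here is a minimal sketch of the pickle input this script consumes, following the (history, horizon, backoffset, id) key format documented in aggregate__overall. Every number below is a made-up placeholder, not a real result.

import os
import pickle

# Made-up example of the dict stored at output/hist_horiz__TESTset1.pickle;
# keys are (history, horizon, backoffset, id), values hold per-series metrics.
example = {
    (364, 28, 0, '143'): {'SMAPE': 31.2, 'bias': -0.8},
    (364, 28, 0, '143__synth1'): {'SMAPE': 40.5, 'bias': 2.1},  # synthetic: '__' in id
    (182, 14, 7, '531'): {'SMAPE': 27.9, 'bias': 0.3},
}
os.makedirs('output', exist_ok=True)
with open(os.path.join('output', 'hist_horiz__TESTset1.pickle'), 'wb') as f:
    pickle.dump(example, f)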









