feature extraction code

sabrina-hoppe · May 5, 2018 · cebde1b · cebde1b
1 parent a20084f
commit cebde1b
Show file tree

Hide file tree

Showing 9 changed files with 1,279 additions and 2 deletions.
diff --git a/00_compute_features.py b/00_compute_features.py
@@ -0,0 +1,103 @@
+import numpy as np
+import os
+from config import conf as conf
+from featureExtraction import gaze_analysis as ga
+import threading
+import getopt
+import sys
+from config import names as gs
+
+def compute_sliding_window_features(participant, ws, gazeAnalysis_instance):
+	"""
+	Extract the sliding window features for one participant and one window size and save them to file.
+
+	participant: participant number, used to look up the target file names in conf
+	ws: window size; the matching step size is taken from conf.get_step_size
+	gazeAnalysis_instance: object offering get_window_features(window_size, step_size),
+		which returns the window features and the window times
+
+	Two .npy files are written: one with the features and one with the window times.
+	"""
+	step = conf.get_step_size(ws)
+	features_path = conf.get_window_features_file(participant, ws)
+	times_path = conf.get_window_times_file(participant, ws)
+
+	features, times = gazeAnalysis_instance.get_window_features(ws, step)
+	np.save(features_path, features)
+	np.save(times_path, times)
+
+if __name__ == "__main__":
+	# NOTE: this is a Python 2 script (xrange, print statement).
+	#
+	# Part 1: for every participant, compute the sliding window features for all window sizes
+	# and store them as .npy files in the participant's feature folder.
+	for p in xrange(0,conf.n_participants):
+		threads = []  # one thread per time window will be used and collected in this list
+
+		# create data folder, plus one subfolder for participant p
+		if not os.path.exists(conf.get_feature_folder(p)):
+			os.makedirs(conf.get_feature_folder(p))
+
+		# make sure all relevant raw data files exist in the right folder
+		# NOTE(review): assert statements are removed when Python runs with -O
+		gaze_file = conf.get_data_folder(p) + '/gaze_positions.csv'
+		pupil_diameter_file = conf.get_data_folder(p) + '/pupil_diameter.csv'
+		events_file = conf.get_data_folder(p) + '/events.csv'
+		assert os.path.exists(gaze_file) and os.path.exists(pupil_diameter_file) and os.path.exists(events_file)
+
+		# load relevant data (first line of each file is skipped as header; events are kept as strings)
+		gaze = np.genfromtxt(gaze_file, delimiter=',', skip_header=1)
+		pupil_diameter = np.genfromtxt(pupil_diameter_file, delimiter=',', skip_header=1)
+		events = np.genfromtxt(events_file, delimiter=',', skip_header=1, dtype=str)
+
+		# create instance of gazeAnalysis class that will be used for feature extraction
+		# this already does some initial computation that will be useful for all window sizes:
+		extractor = ga.gazeAnalysis(gaze, conf.fixation_radius_threshold, conf.fixation_duration_threshold,
+													conf.saccade_min_velocity, conf.max_saccade_duration,
+													pupil_diameter=pupil_diameter, event_strings=events)
+
+		# compute sliding window features by creating one thread per window size
+		# window sizes whose feature file already exists are skipped, so an interrupted run can be resumed
+		# NOTE(review): all threads share the same extractor instance - confirm that
+		# get_window_features does not modify shared state
+		for window_size in conf.all_window_sizes:
+			if not os.path.exists(conf.get_window_features_file(p, window_size)):
+				thread = threading.Thread(target=compute_sliding_window_features, args=(p, window_size, extractor))
+				thread.start()
+				threads.append(thread)
+
+		# wait for all window sizes of this participant before moving on to the next one
+		for t in threads:
+			t.join()
+
+		print 'finished all features for participant', p
+
+	# Part 2:
+	# Merge the features from all participants into three files per window_size:
+	# merged_features includes all features
+	# merged_traits contains the ground truth personality score ranges
+	# merged_ids contains the participant number and context (way, shop, half of the recording)
+
+	# load ground truth from info folder:
+	binned_personality = np.genfromtxt(conf.binned_personality_file, delimiter=',', skip_header=1)
+	# header row without its first column: the rows written to merged_traits are
+	# binned_personality[p, 1:], so the header must skip the first column as well
+	# (the full header row used before had one name more than the data had columns)
+	trait_labels = np.loadtxt(conf.binned_personality_file, delimiter=',', dtype=str)[0, 1:]
+	annotation = np.genfromtxt(conf.annotation_path, delimiter=',', skip_header=1)
+
+	for window_size in conf.all_window_sizes:
+		print 'merging window size', window_size
+
+		windowfeats_subtask_all = []    # one feature row per window, over all participants
+		windowfeats_subtask_ids = []    # one [person, way/shop, half] row per window
+		windowfeats_subtask_all_y = []  # one row of personality scores per window
+
+		for p in xrange(0, conf.n_participants):
+			featfilename = conf.get_window_features_file(p, window_size)
+			timesfilename = conf.get_window_times_file(p, window_size)
+			if os.path.exists(featfilename) and os.path.exists(timesfilename):
+				data = np.load(featfilename).tolist()
+				windowfeats_subtask_all.extend(data)
+				# every window of participant p gets the same ground truth row
+				windowfeats_subtask_all_y.extend([binned_personality[p, 1:]] * len(data))
+
+				# presumably columns 2 and 3 of the times file are window start and end - verify against gaze_analysis
+				times = np.load(timesfilename)[:, 2:]
+				# README: annotation.csv holds the times when the participant entered and left the shop
+				ann = annotation[p,1:]
+
+				ids_annotation = np.zeros((len(data), 3), dtype=int) # person, way/shop, half
+				ids_annotation[:,0] = p
+				# comparisons are strict: windows that match none of the three conditions
+				# (e.g. windows spanning the shop entrance or exit) keep the value 0
+				ids_annotation[(times[:,1] < ann[0]),1] = conf.time_window_annotation_wayI
+				ids_annotation[(times[:,0] > ann[0]) & (times[:,1] < ann[1]),1] = conf.time_window_annotation_shop
+				ids_annotation[(times[:,0] > ann[1]),1] = conf.time_window_annotation_wayII
+				# explicit floor division: same result as before under Python 2
+				half = len(data) // 2
+				ids_annotation[:half, 2] = conf.time_window_annotation_halfI
+				ids_annotation[half:, 2] = conf.time_window_annotation_halfII
+
+				windowfeats_subtask_ids.extend(ids_annotation.tolist())
+			else:
+				# merged files are only useful if they contain every participant
+				print 'did not find ', featfilename
+				sys.exit(1)
+
+		ids = np.array(windowfeats_subtask_ids)
+		x = np.array(windowfeats_subtask_all, dtype=float)
+		y = np.array(windowfeats_subtask_all_y)
+		f1, f2, f3 = conf.get_merged_feature_files(window_size)
+
+		# comments='' writes the header line without a leading '# '
+		np.savetxt(f1, x, delimiter=',', header=','.join(gs.full_long_label_list), comments='')
+		np.savetxt(f2, y, delimiter=',', header=','.join(trait_labels), comments='')
+		# NOTE(review): this header only names the first of the three id columns (person, way/shop, half);
+		# left unchanged because readers of merged_ids are not visible here
+		np.savetxt(f3, ids, delimiter=',', header='Participant ID', comments='')
diff --git a/README.md b/README.md
@@ -1,8 +1,7 @@
 # Eye movements during everyday behavior predict personality traits
 *Sabrina Hoppe, Tobias Loetscher, Stephanie Morey and Andreas Bulling*
 
-This repository provides all data used for the publication [in Frontiers in Human Neuroscience](https://dx.doi.org/10.3389/fnhum.2018.00105).  
-Code is coming soon!
+This repository provides all data and code used for the publication [in Frontiers in Human Neuroscience](https://dx.doi.org/10.3389/fnhum.2018.00105).  
 
 ## Dataset
    *  Gaze data recorded at 60Hz from 42 participants is stored in `data/ParticipantXX`.  
@@ -20,6 +19,14 @@ Code is coming soon!
 
    * Timestamps indicating the times when participants entered and left the shop are given in `info/annotation.csv` in seconds.  
 
+
+## Code
+Reproducing the paper results step by step:
+1. __Extract features from raw gaze data__:    
+   `python 00_compute_features.py` to compute gaze features for all participants  
+   Once extracted, the features are stored in `features/ParticipantXX/window_features_YY.npy` where XX is the participant number and YY the length of the sliding window in seconds.  
+
+
 ## Citation  
 If you want to cite this project, please use the following Bibtex format:
 

diff --git a/__init__.py b/__init__.py
@@ -0,0 +1 @@
+
diff --git a/config/__init__.py b/config/__init__.py
diff --git a/config/conf.py b/config/conf.py
@@ -0,0 +1,97 @@
+import numpy as np
+
+# global parameters
+n_participants = 42
+n_traits = 7
+# 207 matches the length of full_label_list in config/names.py
+# (45 event + 64 heatmap + 34 position + 2 * 32 wordbook labels)
+max_n_feat = 207
+max_n_iter = 100
+# lengths of the sliding windows in seconds (see README)
+all_window_sizes = [5, 15, 30, 45, 60, 75, 90, 105, 120, 135]
+all_shop_window_sizes = [5, 15]  # at least 3/4 of the people have a time window in these times
+
+# cross validation parameters
+n_inner_folds = 3
+n_outer_folds = 5
+
+# Random Forest Parameters
+tree_max_features = 15
+tree_max_depth = 5
+n_estimators = 100
+max_n_jobs = 5
+
+# given a window size, determine step size correctly for even and odd numbers
+def get_step_size(window_size):
+	"""
+	Return the step size for a sliding window: half of the window size.
+
+	window_size: length of the window (int or float)
+
+	The result is an int when half the window size is a whole number (e.g. 30 -> 15)
+	and a float otherwise (e.g. 15 -> 7.5).
+	"""
+	step_size = window_size / 2.0
+	# test for a whole number directly: the former check (step_size * 10 % 2 == 0) was
+	# also true for values like 0.2 or 1.2, which were then truncated to 0 or 1
+	if step_size.is_integer():
+		step_size = int(step_size)
+	return step_size
+
+# relative paths
+data_folder = 'data'
+info_folder = 'info'
+feature_folder = 'features'
+result_folder = 'results'
+figure_folder = 'figures'
+annotation_path = info_folder + '/annotation.csv'
+binned_personality_file = info_folder + '/binned_personality.csv'
+personality_sex_age_file = info_folder + '/personality_sex_age.csv'
+
+# load the personality trait names from file and map them to abbreviations
+# the names are the first row of the csv file without its first column;
+# note that the file is read whenever this module is imported
+traitlabels = np.loadtxt(binned_personality_file, delimiter=',', dtype=str)[0, 1:]
+def get_abbr(s):
+	"""
+	Return the initials of all whitespace-separated words in s that start with an upper case letter.
+	"""
+	initials = []
+	for word in s.split():
+		first = word[0]
+		if first.isupper():
+			initials.append(first)
+	return ''.join(initials)
+# medium: labels consisting of a single word are kept, labels with several words are abbreviated
+medium_traitlabels = [s if " " not in s else get_abbr(s) for s in traitlabels]
+# short: every label is reduced to the initials of its capitalised words
+short_traitlabels = [get_abbr(tl) for tl in traitlabels]
+
+
+# dynamically create relative paths for result files to create
+def get_result_folder(annotation_val):
+	"""
+	Return the result sub folder for one annotation value: result_folder + '/A' + annotation_val.
+	"""
+	subfolder = 'A' + str(annotation_val)
+	return '/'.join([result_folder, subfolder])
+
+def get_result_filename(annotation_val, trait, shuffle_labels, i, add_suffix=False):
+	"""
+	Build the path of one result file inside the result folder of annotation_val.
+
+	trait: index into short_traitlabels; the abbreviation is the start of the file name
+	shuffle_labels: if true, '_rnd' is appended to the abbreviation
+	i: number appended after an underscore, zero-padded to three digits
+	add_suffix: if true, the file extension '.npz' is appended
+	"""
+	parts = [get_result_folder(annotation_val), '/', short_traitlabels[trait]]
+	if shuffle_labels:
+		parts.append('_rnd')
+	parts.append('_' + str(i).zfill(3))
+	if add_suffix:
+		parts.append('.npz')
+	return ''.join(parts)
+
+def get_feature_folder(participant):
+	"""
+	Return the feature folder of one participant; the number is zero-padded to two digits.
+	"""
+	participant_id = str(participant).zfill(2)
+	return '/'.join([feature_folder, 'Participant' + participant_id])
+
+def get_merged_feature_files(window_size):
+	"""
+	Return the paths of the three merged csv files for one window size
+	as a tuple (merged_features, merged_traits, merged_ids).
+	"""
+	ending = '_' + str(window_size) + '.csv'
+	return tuple(feature_folder + '/merged_' + kind + ending for kind in ('features', 'traits', 'ids'))
+
+def get_data_folder(participant):
+	"""
+	Return the raw data folder of one participant; the number is zero-padded to two digits.
+	"""
+	participant_id = str(participant).zfill(2)
+	return '/'.join([data_folder, 'Participant' + participant_id])
+
+def get_window_times_file(participant, window_size):
+	"""
+	Return the path of the .npy file with the window times of one participant and window size.
+	"""
+	filename = "window_times_" + str(window_size) + '.npy'
+	return get_feature_folder(participant) + "/" + filename
+
+def get_window_features_file(participant, window_size):
+	"""
+	Return the path of the .npy file with the window features of one participant and window size.
+	"""
+	filename = "window_features_" + str(window_size) + '.npy'
+	return get_feature_folder(participant) + "/" + filename
+
+def get_overall_features_file(participant):
+	"""
+	Return the path of the overall_features.npy file inside the participant's feature folder.
+	"""
+	folder = get_feature_folder(participant)
+	return folder + "/overall_features.npy"
+
+
+# parameters for fixation/saccade detection
+fixation_radius_threshold = 0.025
+fixation_duration_threshold = 0.1
+saccade_min_velocity = 2
+max_saccade_duration = 0.5
+
+# annotation constants (as given as arguments to train_classifier, and as used for file names in result_folder)
+annotation_all = 0
+annotation_ways = 1
+annotation_shop = 2
+annotation_values = [annotation_all, annotation_ways, annotation_shop]
+
+# annotations used in merged_ids_* files in the feature_folder
+# (column numbers are 0-based; column 0 holds the participant number)
+# column 1: part of the recording a window belongs to
+# windows that match none of the three parts keep the initial value 0 (see 00_compute_features.py)
+time_window_annotation_wayI = 1
+time_window_annotation_shop = 2
+time_window_annotation_wayII = 3
+# column 2: first or second half of a participant's windows
+time_window_annotation_halfI = 1
+time_window_annotation_halfII = 2
diff --git a/config/names.py b/config/names.py
@@ -0,0 +1,160 @@
+# names of the 12 values stored per fixation
+fixations_list_labels = ['mean x', 'mean y',
+                         'var x', 'var y',
+                         't start', 't end',
+                         'start index', 'end index',
+                         'mean diameter', 'var diameter',
+                         'mean successive angles', 'var successive angles'
+                         ]
+# index constants in the same order as fixations_list_labels
+# NOTE(review): the last two names lack the '_i' suffix used by the others
+fix_mean_x_i = 0
+fix_mean_y_i = 1
+fix_var_x_i = 2
+fix_var_y_i = 3
+fix_start_t_i = 4
+fix_end_t_i = 5
+fix_start_index_i = 6
+fix_end_index_i = 7
+fix_mean_diam_i = 8
+fix_var_diam_i = 9
+fix_mean_succ_angles = 10
+fix_var_succ_angles = 11
+
+# names of the 13 values stored per saccade
+saccades_list_labels = ['start x', 'start y',
+                        'end x', 'end y',
+                        'angle',
+                        't start', 't end',
+                        'start index', 'end index',
+                        'mean diameter', 'var diameter',
+                        'peak velocity', 'amplitude',
+                        ]
+
+# index constants in the same order as saccades_list_labels
+sacc_start_x_i = 0
+sacc_start_y_i = 1
+sacc_end_x_i = 2
+sacc_end_y_i = 3
+sacc_angle_i = 4
+sacc_t_start_i = 5
+sacc_t_end_i = 6
+sacc_start_index_i = 7
+sacc_end_index_i = 8
+sacc_mean_diam_i = 9
+sacc_var_diam_i = 10
+sacc_peak_vel_i = 11
+sacc_amplitude_i = 12
+
+# names of the 4 values stored per blink
+blink_list_labels = ['t start', 't end', 'start index', 'end index']
+
+# index constants in the same order as blink_list_labels
+# NOTE(review): 'blink_end_ti_i' looks like a typo for 'blink_end_t_i' (compare blink_start_t_i);
+# renaming it would break users of this name
+blink_start_t_i = 0
+blink_end_ti_i = 1
+blink_start_index_i = 2
+blink_end_index_i = 3
+
+# short names of the 45 event based features; the trailing comments give the list indices
+event_feature_labels = ['fixation rate', 'saccade rate',  # 0 1
+                        'small sacc. rate', 'large sacc. rate', 'positive sacc. rate', 'negative sacc. rate',  # 2 3 4 5
+                        'ratio sacc - fix',  # 6
+                        'ratio small sacc', 'ratio large sacc', 'ratio right sacc', 'ratio left sacc',  # 7 8 9 10
+                        'mean sacc amplitude', 'var sacc amplitude', 'min sacc amplitude', 'max sacc amplitude',  # 11 12 13 14
+                        'mean peak velocity', 'var peak velocity', 'min peak velocity', 'max peak velocity',  # 15 16 17 18
+                        'mean mean diameter sacc', 'var mean  diameter sacc', 'mean var diameter sacc',  # 19 20 21
+                        'var var diameter sacc',  # 22
+                        'mean fix duration', 'var fix duration', 'min fix duration', 'max fix duration',  # 23 24 25 26
+                        'dwelling time',  # 27
+                        'mean mean subsequent angle', 'var mean subsequent angle', 'mean var subsequent angle', 'var var subsequent angle',  # 28 29 30 31
+                        'mean var x', 'mean var y', 'var var x', 'var var y',  # 32 33 34 35
+                        'mean mean diameter fix', 'var mean diameter fix', 'mean var diameter fix', 'var var diameter fix',  # 36 37 38 39
+                        'mean blink duration', 'var blink duration', 'min blink duration', 'max blink duration',  # 40 41 42 43
+                        'blink rate'  # 44
+                        ]
+
+# long names of the 45 event based features, in the same order as event_feature_labels;
+# the trailing comments give the list indices
+event_feature_labels_long = ['fixation rate', 'saccade rate',  # 0 1
+                             'small saccade rate', 'large saccade rate', 'positive saccade rate', 'negative saccade rate',  # 2 3 4 5
+                             'saccade:fixation ratio',  # 6
+                             'ratio of small saccades', 'ratio of large saccades', 'ratio of right saccades', 'ratio of left saccades',  # 7 8 9 10
+                             'mean saccade amplitude', 'var saccade amplitude', 'min saccade amplitude', 'max saccade amplitude',  # 11 12 13 14
+                             'mean saccadic peak velocity', 'var saccadic peak velocity', 'min saccadic peak velocity', 'max saccadic peak velocity',  # 15 16 17 18
+                             'mean of the mean pupil diameter during saccades', 'var of the mean pupil diameter during saccades',  # 19 20
+                             'mean of the var pupil diameter during saccades', 'var of the var pupil diameter during saccades', # 21 22
+                             'mean fixation duration', 'var fixation duration', 'min fixation duration', 'max fixation duration',  # 23 24 25 26
+                             'dwelling time',  # 27
+                             'mean of the mean of subsequent angles', 'var of the mean of subsequent angles',  # 28 29
+                             'mean of the var of subsequent angles', 'var of the var of subsequent angles',  # 30 31
+                             'mean of the var of x', 'mean of the var of y', 'var of the var of x', 'var of the var of y',  # 32 33 34 35
+                             'mean of the mean pupil diameter during fixations', 'var of the mean pupil diameter during fixations',  # 36 37
+                             'mean of the var pupil diameter during fixations', 'var of the var pupil diameter during fixations',  # 38 39
+                             'mean blink duration', 'var blink duration', 'min blink duration', 'max blink duration',  # 40 41 42 43
+                             'blink rate'  # 44
+                            ]
+
+def get_wordbook_feature_labels(movement_abbreviation):
+    """
+    Return the 32 short wordbook feature labels for one movement type:
+    8 statistics for each of the wordbooks WB1 to WB4, grouped by wordbook.
+    """
+    statistics = ['>0', 'max', 'min', 'arg max', 'arg min', 'range', 'mean', 'var']
+    labels = []
+    for n in [1, 2, 3, 4]:
+        wordbook = ' WB' + str(n)
+        for statistic in statistics:
+            labels.append(movement_abbreviation + statistic + wordbook)
+    return labels
+
+def get_wordbook_feature_labels_long(movement_abbreviation):
+    """
+    Return the 32 long wordbook feature labels for one movement type:
+    8 statistics for each n-gram length from 1 to 4, grouped by n-gram length.
+    """
+    # (text before, text after) the '<n>-gram <movement>' part of a label
+    templates = [('number of different ', ' movements'),
+                 ('max frequency ', ' movements'),
+                 ('min frequency ', ' movements'),
+                 ('most frequent ', ' movement'),
+                 ('least frequent ', ' movement'),
+                 ('range of frequencies of ', ' movements'),
+                 ('mean frequency of ', ' movements'),
+                 ('var frequency of ', ' movements')
+                 ]
+    labels = []
+    for n in [1, 2, 3, 4]:
+        ngram = str(n) + '-gram ' + movement_abbreviation
+        for before, after in templates:
+            labels.append(before + ngram + after)
+    return labels
+
+# short names of the 34 position features:
+# 11 statistics, each for x, y and diameter, plus the mean subsequent angle
+position_feature_labels = ['mean x', 'mean y', 'mean diameter',
+                           'min x', 'min y', 'min diameter',
+                           'max x', 'max y', 'max diameter',
+                           'min-max x', 'min-max y', 'min-max diameter',
+                           'std x', 'std y', 'std diameter',
+                           'median x', 'median y', 'median diameter',
+                           '1st quart x', '1st quart y', '1st quart diameter',
+                           '3rd quart x', '3rd quart y', '3rd quart diameter',
+                           'IQR x', 'IQR y', 'IQR diameter',
+                           'mean abs diff x', 'mean abs diff y', 'mean abs diff diameter',
+                           'mean diff x', 'mean diff y', 'mean diff diameter',
+                           'mean subsequent angle'
+                           ]
+
+# long names of the 34 position features, in the same order as position_feature_labels
+# NOTE(review): 'mean difference of subsequent ...' stands at the position of the short label
+# 'mean abs diff ...' (the word 'absolute' is missing), while 'mean diff ...' was not expanded;
+# the strings are left untouched because they are written to the merged_features csv header
+position_feature_labels_long = ['mean x', 'mean y', 'mean pupil diameter',
+                                'minimum x', 'minimum y', 'minimum pupil diameter',
+                                'maximum x', 'maximum y', 'maximum pupil diameter',
+                                'range x', 'range y', 'range pupil diameter',
+                                'std x', 'std y', 'std pupil diameter',
+                                'median x', 'median y', 'median pupil diameter',
+                                '1st quartile x', '1st quartile y', '1st quartile pupil diameter',
+                                '3rd quartile x', '3rd quartile y', '3rd quartile pupil diameter',
+                                'inter quartile range x', 'inter quartile range y', 'inter quartile range pupil diameter',
+                                'mean difference of subsequent x', 'mean difference of subsequent y', 'mean difference of subsequent pupil diameters',
+                                'mean diff x', 'mean diff y', 'mean diff pupil diameter',
+                                'mean subsequent angle'
+                                ]
+
+# one label per heatmap cell, numbered 00 to 63 (xrange: Python 2)
+heatmap_feature_labels = ['heatmap_'+str(i).zfill(2) for i in xrange(0, 64)]
+heatmap_feature_labels_long = ['heatmap cell '+str(i).zfill(2) for i in xrange(0, 64)]
+
+# all feature labels in one list: 45 event + 64 heatmap + 34 position
+# + 32 saccade wordbook + 32 saccade/fixation wordbook labels = 207 entries
+full_label_list = event_feature_labels + heatmap_feature_labels + position_feature_labels + \
+                  get_wordbook_feature_labels('sacc.') + get_wordbook_feature_labels('SF')
+
+# long names in the same order as full_label_list
+full_long_label_list = event_feature_labels_long + heatmap_feature_labels_long + position_feature_labels_long + \
+                  get_wordbook_feature_labels_long('sacc.') + get_wordbook_feature_labels_long('SF')
+
+
+# 24 single-character symbols; sacc_bins[n-1] lists every sequence of n symbols
+# NOTE: these lists are built when the module is imported; sacc_bins_four alone has 24**4 = 331776 entries
+sacc_dictionary = ['A', 'B', 'C', 'R', 'E', 'F', 'G', 'D', 'H', 'J', 'K', 'L', 'M', 'N', 'O', 'U', 'u', 'b', 'r', 'f',
+					'd', 'j', 'l', 'n']
+sacc_bins_two = [a+b for a in sacc_dictionary for b in sacc_dictionary]
+sacc_bins_three = [a+b+c for a in sacc_dictionary for b in sacc_dictionary for c in sacc_dictionary]
+sacc_bins_four = [a+b+c+d for a in sacc_dictionary for b in sacc_dictionary for c in sacc_dictionary for d in sacc_dictionary]
+sacc_bins = [sacc_dictionary, sacc_bins_two, sacc_bins_three, sacc_bins_four]
+
+# 10 multi-character symbols (8 starting with 'S_', 2 starting with 'F_');
+# saccFix_bins[n-1] lists every sequence of n symbols, concatenated without separator
+# (saccFix_bins_four has 10**4 = 10000 entries)
+saccFix_dictionary = ['S_lu', 'S_ld', 'S_lr', 'S_ll', 'S_su', 'S_sd', 'S_sr', 'S_sl', 'F_l', 'F_s']
+saccFix_bins_two = [a+b for a in saccFix_dictionary for b in saccFix_dictionary]
+saccFix_bins_three = [a+b+c for a in saccFix_dictionary for b in saccFix_dictionary for c in saccFix_dictionary]
+saccFix_bins_four = [a+b+c+d for a in saccFix_dictionary for b in saccFix_dictionary for c in saccFix_dictionary for d in saccFix_dictionary]
+saccFix_bins = [saccFix_dictionary, saccFix_bins_two, saccFix_bins_three, saccFix_bins_four]
+
+def write_pami_feature_labels_to_file(targetfile):
+    """
+    Write a csv table of all feature labels to targetfile.
+
+    targetfile: path of the file to write; it is created or overwritten
+
+    The first line is the header ',short,long'. Every following line holds the
+    feature index, its label from full_label_list and its label from full_long_label_list.
+    """
+    # the with statement closes the file even if one of the writes fails
+    with open(targetfile, 'w') as f:  # creates if it does not exist
+        f.write(',short,long\n')
+        for i, (item1, item2) in enumerate(zip(full_label_list, full_long_label_list)):
+            f.write(str(i) + ',' + item1 + ',' + item2 + '\n')
diff --git a/featureExtraction/__init__.py b/featureExtraction/__init__.py