"""Find, for every cluster, the observation that lies closest to its center.

Inputs (CSV files in the working directory):

* the cluster centers,
* the PCA-converted, clustered data set (feature columns followed by the
  cluster number),
* the file name / translation table,
* the list of indices of the observations at which the eye gaze is not
  "researcher".

The script prints a dict mapping each cluster number to the index of the
observation closest to that cluster's center.
"""

import pandas as pd

CENTERS_CSV = "3785_2centupdate.csv"
CLUSTERED_CSV = "3785_2clusupdate+index.csv"
TRANSLATIONS_CSV = "file_trans_ofniet.csv"
INDICES_CSV = "i_nores_incl-ofniet.csv"

# Set to True to report [index, file name, translation] per cluster instead
# of only [index].
INCLUDE_FILE_INFO = False


def closest_to_centers(
    centers: pd.DataFrame,
    clus_data: pd.DataFrame,
    index_map=None,
    metric: str = "sse",
    details=None,
) -> dict:
    """Return, per cluster, the observation closest to the cluster's center.

    Args:
        centers: One row per cluster; row ``n`` holds the coordinates of the
            center of cluster ``n``.
        clus_data: One row per observation. Must contain a ``'cluster'``
            column; every column except the last one is treated as a
            coordinate (the last column is expected to be the cluster
            number).
        index_map: Optional sequence translating the positional row number
            of an observation in ``clus_data`` into the index that should be
            reported (used when rows were removed from the complete data
            set). When ``None`` the row number itself is reported.
        metric: ``"sse"`` for the sum of squared differences (default) or
            ``"sad"`` for the sum of absolute differences.
        details: Optional DataFrame with ``'File'`` and ``'Translation'``
            columns, looked up by the reported index. When given, each
            result is ``[index, file, translation]`` instead of ``[index]``.

    Returns:
        Dict mapping cluster number ``n`` (``0 .. len(centers) - 1``) to a
        list describing the closest observation. On ties the first
        observation in row order wins.

    Raises:
        ValueError: If ``metric`` is unknown, if the observations have fewer
            coordinate columns than the centers, or if a cluster has no
            observations.
    """
    if metric not in ("sse", "sad"):
        raise ValueError(f"unknown metric {metric!r}; expected 'sse' or 'sad'")

    center_values = centers.to_numpy()
    width = center_values.shape[1]

    # Everything but the last column is a coordinate. Selecting with iloc
    # builds a new frame, so clus_data itself is never modified.
    feature_values = clus_data.iloc[:, :-1].to_numpy()
    if feature_values.shape[1] < width:
        raise ValueError(
            f"observations have {feature_values.shape[1]} coordinate columns "
            f"but the centers have {width}"
        )
    # Only the dimensions present in the centers take part in the distance.
    feature_values = feature_values[:, :width]
    cluster_ids = clus_data["cluster"].to_numpy()

    closest = {}
    for n in range(len(centers)):
        # Positional row numbers of the observations assigned to cluster n.
        rows = (cluster_ids == n).nonzero()[0]
        if rows.size == 0:
            raise ValueError(f"cluster {n} has no observations")

        diffs = feature_values[rows] - center_values[n]
        if metric == "sse":
            distances = (diffs ** 2).sum(axis=1)
        else:
            distances = abs(diffs).sum(axis=1)

        # argmin returns the first minimum, i.e. the earliest row on ties.
        row = int(rows[distances.argmin()])
        ind_min = index_map[row] if index_map is not None else row

        if details is None:
            closest[n] = [ind_min]
        else:
            closest[n] = [
                ind_min,
                details["File"][ind_min],
                details["Translation"][ind_min],
            ]
    return closest


def main() -> None:
    """Load the CSV inputs and print the closest observation per cluster."""
    centers = pd.read_csv(CENTERS_CSV)
    clus_data = pd.read_csv(CLUSTERED_CSV)
    # The first column is dropped before any distance is computed
    # (presumably the row index saved along with the data -- confirm).
    clus_data = clus_data.drop(columns=clus_data.columns[0])
    fn_trans = pd.read_csv(TRANSLATIONS_CSV)
    indices = pd.read_csv(INDICES_CSV)
    # Indices of the observations kept after the eye gaze "researcher"
    # observations were removed. Pass index_map=None below when working
    # with the complete data set.
    index_c = indices["indices no_res"].tolist()

    min_ind_l = closest_to_centers(
        centers,
        clus_data,
        index_map=index_c,
        details=fn_trans if INCLUDE_FILE_INFO else None,
    )
    print(min_ind_l)
    # The result can be stored with, e.g.:
    #   pd.DataFrame(min_ind_l).to_csv('./clo_to_cen_(511)SSE.csv', index=False)


if __name__ == "__main__":
    main()