Clusters in Effects of Nicotine Dependency

 Introduction

Nesarc dataset is used for clustering people who are between 18 and 25, having recently alcohol consumption and/or nicotine dependency. The dataset is also included panic disorder and experience depression in life.

Getting Data


Everyday smoker in the past 12 months selected for cluster data.


Preparing Data



NDSymptoms : Nicotine dependency symptoms calculated in 8 different criteria.


NICOTINEDEP : Equation
TAB12MDX = 1 is nicotine dependency .



PANIC : All panic symptoms are in the equation in data.


ETHRACE
: Etnicity

0. Hispanic or Latino 
1. White, Not Hispanic or Latino
2. Black, Not Hispanic or Latino
3. American Indian/Alaska Native, Not Hispanic or Latino
4. Asian/Native Hawaiian/Pacific Islander, Not Hispanic or Latino

Plotting Elbow Method


The elbow method shows that the cluster number is three.

Modelling the Cluster



It looks like there is a good separation between the clusters.

Cluster Groups




The first cluster has the highest nicotine dependency and the second one has the lowest one.

OLS Regression Result

To understand how the clusters differ on GENERAL ANXIETY(S9Q1A) with an analysis of variance, means tables and a tukey test:



The p-value from the analysis of variance is much higher than 0.05, indicates that the differences between clusters was not significant. We can try higher cluster to test the data more.  

CODE

# -*- coding: utf-8 -*- """ Created on Tue Sep 1 09:54:18 2015 @author: jrose01 """ import numpy import pandas from pandas import Series, DataFrame import statsmodels.api as sm import seaborn import statsmodels.formula.api as smf import statsmodels.stats.multicomp as multi import matplotlib.pylab as plt from sklearn.model_selection import train_test_split from sklearn import preprocessing from sklearn.cluster import KMeans # bug fix for display formats to avoid run time errors pandas.set_option('display.float_format', lambda x:'%.2f'%x) data = pandas.read_csv('data/nesarc_pds.csv', low_memory=False) ############################################################################## # DATA MANAGEMENT ############################################################################## # Age is below 25 # USUAL FREQUENCY WHEN SMOKED CIGARETTES = Everyday # Smoked cigarettes in the past 12 months data=data[(data['AGE']<=25) & (data['CHECK321']==1) & (data['S3AQ3B1']==1) & (data['IDNUM']!=20346) & (data['IDNUM']!=36471) & (data['IDNUM']!=28724)] #setting variables you will be working with to numeric def SetColumns(cols): for col in cols: data[col] = pandas.to_numeric(data[col], errors='coerce') Columns = ['IDNUM','TAB12MDX','CONSUMER','NDSymptoms','MAJORDEPLIFE', 'SOCPDLIFE', 'S3AQ3C1', 'AGE', 'SEX', 'S3AQ3B1','S3AQ8B7J', 'CHECK321', 'S3AQ8B11', 'S3AQ8B12', 'S3AQ8B13', 'IDNUM', 'TAB12MDX'] ColumnsForNicotine = [ 'S3AQ8B7A', 'S3AQ8B7B', 'S3AQ8B7C', 'S3AQ8B7D', 'S3AQ8B7E', 'S3AQ8B7F', 'S3AQ8B7G', 'S3AQ8B7H'] ColumnsForPanicAttack = ['S6Q1','S6Q2','S6Q3','S6Q7','S6Q61', 'S6Q62','S6Q63','S6Q64','S6Q65','S6Q66','S6Q67','S6Q68','S6Q69', 'S6Q610','S6Q611','S6Q612','S6Q613'] SetColumns(Columns) SetColumns(ColumnsForNicotine) SetColumns(ColumnsForPanicAttack) data['S3AQ3C1']=data['S3AQ3C1'].replace(99, numpy.nan) # Current Tolerance criteria #1 DSM-IV def crit1 (row): if row['S3AQ8B11']==1 or row['S3AQ8B12'] == 1 : return 1 elif row['S3AQ8B11']==2 and row['S3AQ8B12']==2 : return 0 data['crit1'] = data.apply (lambda row: crit1 (row),axis=1) chk2 = data['crit1'].value_counts(sort=False, dropna=False) print (chk2) chk3 = data['S3AQ8B11'].value_counts(sort=False, dropna=False) print (chk3) chk4 = data['S3AQ8B12'].value_counts(sort=False, dropna=False) print (chk4) print (pandas.crosstab(data['S3AQ8B11'], data['S3AQ8B12'])) c1 = data['S3AQ8B7J'].value_counts(sort=False, dropna=False) print (c1) # check recode def NicotineRecoding(cols): recode1 = {1: 1, 2: 0} for col in cols: data[col]=data[col].replace(9, numpy.nan) data[col]= data[col].map(recode1) NicotineRecoding(ColumnsForNicotine) # after recoding 9s to missing chk1c = data['S3AQ8B7J'].value_counts(sort=False, dropna=False) print (chk1c) # sum symptoms data['CWITHDR_COUNT'] = numpy.nansum([data['S3AQ8B7A'], data['S3AQ8B7B'], data['S3AQ8B7C'], data['S3AQ8B7D'], data['S3AQ8B7E'], data['S3AQ8B7F'], data['S3AQ8B7G'], data['S3AQ8B7H']], axis=0) # check to make sure sum code worked chksum=data[['IDNUM','S3AQ8B7A', 'S3AQ8B7B', 'S3AQ8B7C', 'S3AQ8B7D', 'S3AQ8B7E', 'S3AQ8B7F', 'S3AQ8B7G', 'S3AQ8B7H', 'CWITHDR_COUNT']] chksum.head(n=50) chk1d = data['CWITHDR_COUNT'].value_counts(sort=False, dropna=False) print (chk1d) # withdrawal (yes/no) def crit2 (row): if row['CWITHDR_COUNT']>=4 or row['S3AQ8B7J']==1: return 1 elif row['CWITHDR_COUNT']<4 and row['S3AQ8B7J']!=1: return 0 data['crit2'] = data.apply (lambda row: crit2 (row),axis=1) print (pandas.crosstab(data['CWITHDR_COUNT'], data['crit2'])) #Current Larger amount or longer period criteria #3 DSM-IV recode1 = {1: 1, 2: 0} data['S3AQ8B13']=data['S3AQ8B13'].replace(9, numpy.nan) data['S3AQ8B13']= data['S3AQ8B13'].map(recode1) chk1d = data['S3AQ8B13'].value_counts(sort=False, dropna=False) print (chk1d) #Current Cut down criteria #4 DSM-IV data['S3AQ8B6'] = pandas.to_numeric(data['S3AQ8B6'], errors='coerce') data['S3AQ8B1'] = pandas.to_numeric(data['S3AQ8B1'], errors='coerce') def crit4 (row): if row['S3AQ8B6']==1 or row['S3AQ8B1'] == 1 : return 1 elif row['S3AQ8B6']==2 and row['S3AQ8B1']==2 : return 0 data['crit4'] = data.apply (lambda row: crit4 (row),axis=1) chk1e = data['crit4'].value_counts(sort=False, dropna=False) print (chk1e) #Current Substance activities criteria #5 DSM-IV data['S3AQ8B5'] = pandas.to_numeric(data['S3AQ8B5'], errors='coerce') data['S3AQ8B5']=data['S3AQ8B5'].replace(9, numpy.nan) data['S3AQ8B5']= data['S3AQ8B5'].map(recode1) chk1f = data['S3AQ8B5'].value_counts(sort=False, dropna=False) print (chk1f) #Current Reduce activities criteria #6 DSM-IV data['S3AQ8B2'] = pandas.to_numeric(data['S3AQ8B2'], errors='coerce') data['S3AQ8B3'] = pandas.to_numeric(data['S3AQ8B3'], errors='coerce') def crit6 (row): if row['S3AQ8B2']==1 or row['S3AQ8B3'] == 1 : return 1 elif row['S3AQ8B2']==2 and row['S3AQ8B3']==2 : return 0 data['crit6'] = data.apply (lambda row: crit6 (row),axis=1) chk1g = data['crit6'].value_counts(sort=False, dropna=False) print (chk1g) #Current use continued despite knowledge of physical or psychological problem criteria #7 DSM-IV data['S3AQ8B4'] = pandas.to_numeric(data['S3AQ8B4'], errors='coerce') data['S3AQ8B14'] = pandas.to_numeric(data['S3AQ8B14'], errors='coerce') def crit7 (row): if row['S3AQ8B4']==1 or row['S3AQ8B14'] == 1 : return 1 elif row['S3AQ8B4']==2 and row['S3AQ8B14']==2 : return 0 data['crit7'] = data.apply (lambda row: crit7 (row),axis=1) chk1h = data['crit7'].value_counts(sort=False, dropna=False) print (chk1h) # sum all symptoms (np.nansum allows rows with some missing values to count all valid values) data['NDSymptoms'] = numpy.nansum([data['crit1'], data['crit2'], data['S3AQ8B13'], data['crit4'], data['S3AQ8B5'], data['crit6'], data['crit7']], axis=0 ) chk2 = data['NDSymptoms'].value_counts(sort=False, dropna=False) print (chk2) c1 = data["MAJORDEPLIFE"].value_counts(sort=False, dropna=False) print(c1) c2 = data["AGE"].value_counts(sort=False, dropna=False) print(c2) # binary nicotine dependence def NICOTINEDEP (x): if x['TAB12MDX']==1: return 1 else: return 0 data['NICOTINEDEP'] = data.apply (lambda x: NICOTINEDEP (x), axis=1) print (pandas.crosstab(data['TAB12MDX'], data['NICOTINEDEP'])) # rename variables data.rename(columns={'S3AQ3C1': 'NUMBERCIGSMOKED'}, inplace=True) c6 = data["NUMBERCIGSMOKED"].value_counts(sort=False, dropna=False) print(c6) def PANIC (x1): if ((x1['S6Q1']==1 and x1['S6Q2']==1) or (x1['S6Q2']==1 and x1['S6Q3']==1) or (x1['S6Q3']==1 and x1['S6Q61']==1) or (x1['S6Q61']==1 and x1['S6Q62']==1) or (x1['S6Q62']==1 and x1['S6Q63']==1) or (x1['S6Q63']==1 and x1['S6Q64']==1) or (x1['S6Q64']==1 and x1['S6Q65']==1) or (x1['S6Q65']==1 and x1['S6Q66']==1) or (x1['S6Q66']==1 and x1['S6Q67']==1) or (x1['S6Q67']==1 and x1['S6Q68']==1) or (x1['S6Q68']==1 and x1['S6Q69']==1) or (x1['S6Q69']==1 and x1['S6Q610']==1) or (x1['S6Q610']==1 and x1['S6Q611']==1) or (x1['S6Q611']==1 and x1['S6Q612']==1) or (x1['S6Q612']==1 and x1['S6Q613']==1) or (x1['S6Q613']==1 and x1['S6Q7']==1) or x1['S6Q7']==1): return 1 else: return 0 data['PANIC'] = data.apply (lambda x1: PANIC (x1), axis=1) c7 = data["PANIC"].value_counts(sort=False, dropna=False) print(c7) # 4 category ethnicity variable data['ETHRACE2A'] = pandas.to_numeric(data['ETHRACE2A'], errors='coerce') recode2 = {1: 1, 2: 2, 3: 3, 4: 3, 5: 0} data['ETHRACE2A'] = data['ETHRACE2A'].replace(9, numpy.nan) data['ETHRACE'] = data['ETHRACE2A'].map(recode2) c8 = data["ETHRACE2A"].value_counts(sort=False, dropna=False) print(c8) c9 = data["ETHRACE"].value_counts(sort=False, dropna=False) print(c9) ############################################################################## # END DATA MANAGEMENT ############################################################################## ############################################################################## # MODELING AND TRAINNING ############################################################################## cluster_column = ['AGE', 'SEX', 'MAJORDEPLIFE', 'CONSUMER', 'CWITHDR_COUNT', 'NDSymptoms', 'MAJORDEPLIFE', 'NICOTINEDEP', 'NUMBERCIGSMOKED', 'PANIC', 'ETHRACE'] def SetFloat(cols): for col in cols: data[col]=preprocessing.scale(data[col].astype('float64')) SetFloat(cluster_column) cluster_data = data[['AGE', 'SEX', 'MAJORDEPLIFE', 'CONSUMER', 'CWITHDR_COUNT', 'NDSymptoms', 'MAJORDEPLIFE', 'NICOTINEDEP', 'NUMBERCIGSMOKED', 'PANIC', 'ETHRACE']] cluster_data = cluster_data.dropna() # split data into train and test sets clus_train, clus_test = train_test_split(cluster_data, test_size=.3, random_state=123) # k-means cluster analysis for 1-9 clusters from scipy.spatial.distance import cdist clusters=range(1,10) meandist=[] for k in clusters: model=KMeans(n_clusters=k) model.fit(clus_train) clusassign=model.predict(clus_train) meandist.append(sum(numpy.min(cdist(clus_train, model.cluster_centers_, 'euclidean'), axis=1)) / clus_train.shape[0]) """ Plot average distance from observations from the cluster centroid to use the Elbow Method to identify number of clusters to choose """ plt.plot(clusters, meandist) plt.xlabel('Number of clusters') plt.ylabel('Average distance') plt.title('Selecting k with the Elbow Method') plt.show() # Interpret 3 cluster solution model3=KMeans(n_clusters=3) model3.fit(clus_train) clusassign=model3.predict(clus_train) # plot clusters from sklearn.decomposition import PCA pca_2 = PCA(2) plot_columns = pca_2.fit_transform(clus_train) plt.scatter(x=plot_columns[:,0], y=plot_columns[:,1], c=model3.labels_,) plt.xlabel('Canonical variable 1') plt.ylabel('Canonical variable 2') plt.title('Scatterplot of Canonical Variables for 3 Clusters') plt.show() """ BEGIN multiple steps to merge cluster assignment with clustering variables to examine cluster variable means by cluster """ # create a unique identifier variable from the index for the # cluster training data to merge with the cluster assignment variable clus_train.reset_index(level=0, inplace=True) # create a list that has the new index variable cluslist=list(clus_train['index']) # create a list of cluster assignments labels=list(model3.labels_) # combine index variable list with cluster assignment list into a dictionary newlist=dict(zip(cluslist, labels)) newlist # convert newlist dictionary to a dataframe newclus=DataFrame.from_dict(newlist, orient='index') newclus # rename the cluster assignment column newclus.columns = ['cluster'] # now do the same for the cluster assignment variable # create a unique identifier variable from the index for the # cluster assignment dataframe # to merge with cluster training data newclus.reset_index(level=0, inplace=True) # merge the cluster assignment dataframe with the cluster training variable dataframe # by the index variable merged_train=pandas.merge(clus_train, newclus, on='index') merged_train.head(n=100) # cluster frequencies merged_train.cluster.value_counts() """ END multiple steps to merge cluster assignment with clustering variables to examine cluster variable means by cluster """ # FINALLY calculate clustering variable means by cluster clustergrp = merged_train.groupby('cluster').mean() print ("Clustering variable means by cluster") print(clustergrp)


# validate clusters in training data by examining cluster differences in anxiety using ANOVA
# first have to merge GENERAL ANXIETY with clustering variables and cluster assignment data 
consumer_data= data['S9Q1A']
# split ANXIETY data into train and test sets
anxiety_train, anxiety_test = train_test_split(consumer_data, test_size=.3, random_state=123)
anxiety_train1=pandas.DataFrame(anxiety_train)
anxiety_train1.reset_index(level=0, inplace=True)
merged_train_all=pandas.merge(anxiety_train1, merged_train, on='index', suffixes=['', '_'])
sub1 = merged_train_all[['S9Q1A', 'cluster']].dropna()




anxietymod = smf.ols(formula='S9Q1A ~ C(cluster)', data=sub1).fit()
print (anxietymod.summary())

print ('means for CONSUMER by cluster')
m1= sub1.groupby('cluster').mean()
print (m1)

print ('standard deviations for CONSUMER by cluster')
m2= sub1.groupby('cluster').std()
print (m2)

mc1 = multi.MultiComparison(sub1['S9Q1A'], sub1['cluster'])
res1 = mc1.tukeyhsd()
print(res1.summary())

Comments

Popular posts from this blog

Classification in Income per Person

Logistic Regresion in Nicotine Dependence and Alcohol Dependence

Lasso Regression in Income per Person