Clusters in Effects of Nicotine Dependency
Introduction
Nesarc dataset is used for clustering people who are between 18 and 25, having recently alcohol consumption and/or nicotine dependency. The dataset is also included panic disorder and experience depression in life.
Getting Data
Preparing Data
PANIC : All panic symptoms are in the equation in data.
0. Hispanic or Latino
1. White, Not Hispanic or Latino
2. Black, Not Hispanic or Latino
3. American Indian/Alaska Native, Not Hispanic or Latino
4. Asian/Native Hawaiian/Pacific Islander, Not Hispanic or Latino
Plotting Elbow Method
Modelling the Cluster
Cluster Groups
OLS Regression Result
To understand how the clusters differ on GENERAL ANXIETY(S9Q1A) with an analysis of variance, means tables and a tukey test:
The p-value from the analysis of variance is much higher than 0.05, indicates that the differences between clusters was not significant. We can try higher cluster to test the data more.
CODE
# -*- coding: utf-8 -*-
"""
Created on Tue Sep 1 09:54:18 2015
@author: jrose01
"""
import numpy
import pandas
from pandas import Series, DataFrame
import statsmodels.api as sm
import seaborn
import statsmodels.formula.api as smf
import statsmodels.stats.multicomp as multi
import matplotlib.pylab as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.cluster import KMeans
# bug fix for display formats to avoid run time errors
pandas.set_option('display.float_format', lambda x:'%.2f'%x)
data = pandas.read_csv('data/nesarc_pds.csv', low_memory=False)
##############################################################################
# DATA MANAGEMENT
##############################################################################
# Age is below 25
# USUAL FREQUENCY WHEN SMOKED CIGARETTES = Everyday
# Smoked cigarettes in the past 12 months
data=data[(data['AGE']<=25) & (data['CHECK321']==1) & (data['S3AQ3B1']==1) &
(data['IDNUM']!=20346) & (data['IDNUM']!=36471) & (data['IDNUM']!=28724)]
#setting variables you will be working with to numeric
def SetColumns(cols):
for col in cols:
data[col] = pandas.to_numeric(data[col], errors='coerce')
Columns = ['IDNUM','TAB12MDX','CONSUMER','NDSymptoms','MAJORDEPLIFE',
'SOCPDLIFE', 'S3AQ3C1', 'AGE', 'SEX', 'S3AQ3B1','S3AQ8B7J', 'CHECK321',
'S3AQ8B11', 'S3AQ8B12', 'S3AQ8B13', 'IDNUM', 'TAB12MDX']
ColumnsForNicotine = [ 'S3AQ8B7A', 'S3AQ8B7B',
'S3AQ8B7C', 'S3AQ8B7D', 'S3AQ8B7E', 'S3AQ8B7F', 'S3AQ8B7G',
'S3AQ8B7H']
ColumnsForPanicAttack = ['S6Q1','S6Q2','S6Q3','S6Q7','S6Q61',
'S6Q62','S6Q63','S6Q64','S6Q65','S6Q66','S6Q67','S6Q68','S6Q69',
'S6Q610','S6Q611','S6Q612','S6Q613']
SetColumns(Columns)
SetColumns(ColumnsForNicotine)
SetColumns(ColumnsForPanicAttack)
data['S3AQ3C1']=data['S3AQ3C1'].replace(99, numpy.nan)
# Current Tolerance criteria #1 DSM-IV
def crit1 (row):
if row['S3AQ8B11']==1 or row['S3AQ8B12'] == 1 :
return 1
elif row['S3AQ8B11']==2 and row['S3AQ8B12']==2 :
return 0
data['crit1'] = data.apply (lambda row: crit1 (row),axis=1)
chk2 = data['crit1'].value_counts(sort=False, dropna=False)
print (chk2)
chk3 = data['S3AQ8B11'].value_counts(sort=False, dropna=False)
print (chk3)
chk4 = data['S3AQ8B12'].value_counts(sort=False, dropna=False)
print (chk4)
print (pandas.crosstab(data['S3AQ8B11'], data['S3AQ8B12']))
c1 = data['S3AQ8B7J'].value_counts(sort=False, dropna=False)
print (c1)
# check recode
def NicotineRecoding(cols):
recode1 = {1: 1, 2: 0}
for col in cols:
data[col]=data[col].replace(9, numpy.nan)
data[col]= data[col].map(recode1)
NicotineRecoding(ColumnsForNicotine)
# after recoding 9s to missing
chk1c = data['S3AQ8B7J'].value_counts(sort=False, dropna=False)
print (chk1c)
# sum symptoms
data['CWITHDR_COUNT'] = numpy.nansum([data['S3AQ8B7A'], data['S3AQ8B7B'], data['S3AQ8B7C'],
data['S3AQ8B7D'], data['S3AQ8B7E'], data['S3AQ8B7F'],
data['S3AQ8B7G'], data['S3AQ8B7H']], axis=0)
# check to make sure sum code worked
chksum=data[['IDNUM','S3AQ8B7A', 'S3AQ8B7B', 'S3AQ8B7C', 'S3AQ8B7D',
'S3AQ8B7E', 'S3AQ8B7F', 'S3AQ8B7G', 'S3AQ8B7H', 'CWITHDR_COUNT']]
chksum.head(n=50)
chk1d = data['CWITHDR_COUNT'].value_counts(sort=False, dropna=False)
print (chk1d)
# withdrawal (yes/no)
def crit2 (row):
if row['CWITHDR_COUNT']>=4 or row['S3AQ8B7J']==1:
return 1
elif row['CWITHDR_COUNT']<4 and row['S3AQ8B7J']!=1:
return 0
data['crit2'] = data.apply (lambda row: crit2 (row),axis=1)
print (pandas.crosstab(data['CWITHDR_COUNT'], data['crit2']))
#Current Larger amount or longer period criteria #3 DSM-IV
recode1 = {1: 1, 2: 0}
data['S3AQ8B13']=data['S3AQ8B13'].replace(9, numpy.nan)
data['S3AQ8B13']= data['S3AQ8B13'].map(recode1)
chk1d = data['S3AQ8B13'].value_counts(sort=False, dropna=False)
print (chk1d)
#Current Cut down criteria #4 DSM-IV
data['S3AQ8B6'] = pandas.to_numeric(data['S3AQ8B6'], errors='coerce')
data['S3AQ8B1'] = pandas.to_numeric(data['S3AQ8B1'], errors='coerce')
def crit4 (row):
if row['S3AQ8B6']==1 or row['S3AQ8B1'] == 1 :
return 1
elif row['S3AQ8B6']==2 and row['S3AQ8B1']==2 :
return 0
data['crit4'] = data.apply (lambda row: crit4 (row),axis=1)
chk1e = data['crit4'].value_counts(sort=False, dropna=False)
print (chk1e)
#Current Substance activities criteria #5 DSM-IV
data['S3AQ8B5'] = pandas.to_numeric(data['S3AQ8B5'], errors='coerce')
data['S3AQ8B5']=data['S3AQ8B5'].replace(9, numpy.nan)
data['S3AQ8B5']= data['S3AQ8B5'].map(recode1)
chk1f = data['S3AQ8B5'].value_counts(sort=False, dropna=False)
print (chk1f)
#Current Reduce activities criteria #6 DSM-IV
data['S3AQ8B2'] = pandas.to_numeric(data['S3AQ8B2'], errors='coerce')
data['S3AQ8B3'] = pandas.to_numeric(data['S3AQ8B3'], errors='coerce')
def crit6 (row):
if row['S3AQ8B2']==1 or row['S3AQ8B3'] == 1 :
return 1
elif row['S3AQ8B2']==2 and row['S3AQ8B3']==2 :
return 0
data['crit6'] = data.apply (lambda row: crit6 (row),axis=1)
chk1g = data['crit6'].value_counts(sort=False, dropna=False)
print (chk1g)
#Current use continued despite knowledge of physical or psychological problem criteria #7 DSM-IV
data['S3AQ8B4'] = pandas.to_numeric(data['S3AQ8B4'], errors='coerce')
data['S3AQ8B14'] = pandas.to_numeric(data['S3AQ8B14'], errors='coerce')
def crit7 (row):
if row['S3AQ8B4']==1 or row['S3AQ8B14'] == 1 :
return 1
elif row['S3AQ8B4']==2 and row['S3AQ8B14']==2 :
return 0
data['crit7'] = data.apply (lambda row: crit7 (row),axis=1)
chk1h = data['crit7'].value_counts(sort=False, dropna=False)
print (chk1h)
# sum all symptoms (np.nansum allows rows with some missing values to count all valid values)
data['NDSymptoms'] = numpy.nansum([data['crit1'], data['crit2'], data['S3AQ8B13'],
data['crit4'], data['S3AQ8B5'], data['crit6'],
data['crit7']], axis=0 )
chk2 = data['NDSymptoms'].value_counts(sort=False, dropna=False)
print (chk2)
c1 = data["MAJORDEPLIFE"].value_counts(sort=False, dropna=False)
print(c1)
c2 = data["AGE"].value_counts(sort=False, dropna=False)
print(c2)
# binary nicotine dependence
def NICOTINEDEP (x):
if x['TAB12MDX']==1:
return 1
else:
return 0
data['NICOTINEDEP'] = data.apply (lambda x: NICOTINEDEP (x), axis=1)
print (pandas.crosstab(data['TAB12MDX'], data['NICOTINEDEP']))
# rename variables
data.rename(columns={'S3AQ3C1': 'NUMBERCIGSMOKED'}, inplace=True)
c6 = data["NUMBERCIGSMOKED"].value_counts(sort=False, dropna=False)
print(c6)
def PANIC (x1):
if ((x1['S6Q1']==1 and x1['S6Q2']==1) or (x1['S6Q2']==1 and x1['S6Q3']==1) or
(x1['S6Q3']==1 and x1['S6Q61']==1) or (x1['S6Q61']==1 and x1['S6Q62']==1) or
(x1['S6Q62']==1 and x1['S6Q63']==1) or (x1['S6Q63']==1 and x1['S6Q64']==1) or
(x1['S6Q64']==1 and x1['S6Q65']==1) or (x1['S6Q65']==1 and x1['S6Q66']==1) or
(x1['S6Q66']==1 and x1['S6Q67']==1) or (x1['S6Q67']==1 and x1['S6Q68']==1) or
(x1['S6Q68']==1 and x1['S6Q69']==1) or (x1['S6Q69']==1 and x1['S6Q610']==1) or
(x1['S6Q610']==1 and x1['S6Q611']==1) or (x1['S6Q611']==1 and x1['S6Q612']==1) or
(x1['S6Q612']==1 and x1['S6Q613']==1) or (x1['S6Q613']==1 and x1['S6Q7']==1) or
x1['S6Q7']==1):
return 1
else:
return 0
data['PANIC'] = data.apply (lambda x1: PANIC (x1), axis=1)
c7 = data["PANIC"].value_counts(sort=False, dropna=False)
print(c7)
# 4 category ethnicity variable
data['ETHRACE2A'] = pandas.to_numeric(data['ETHRACE2A'], errors='coerce')
recode2 = {1: 1, 2: 2, 3: 3, 4: 3, 5: 0}
data['ETHRACE2A'] = data['ETHRACE2A'].replace(9, numpy.nan)
data['ETHRACE'] = data['ETHRACE2A'].map(recode2)
c8 = data["ETHRACE2A"].value_counts(sort=False, dropna=False)
print(c8)
c9 = data["ETHRACE"].value_counts(sort=False, dropna=False)
print(c9)
##############################################################################
# END DATA MANAGEMENT
##############################################################################
##############################################################################
# MODELING AND TRAINNING
##############################################################################
cluster_column = ['AGE', 'SEX', 'MAJORDEPLIFE', 'CONSUMER', 'CWITHDR_COUNT', 'NDSymptoms',
'MAJORDEPLIFE', 'NICOTINEDEP', 'NUMBERCIGSMOKED', 'PANIC', 'ETHRACE']
def SetFloat(cols):
for col in cols:
data[col]=preprocessing.scale(data[col].astype('float64'))
SetFloat(cluster_column)
cluster_data = data[['AGE', 'SEX', 'MAJORDEPLIFE', 'CONSUMER', 'CWITHDR_COUNT', 'NDSymptoms',
'MAJORDEPLIFE', 'NICOTINEDEP', 'NUMBERCIGSMOKED', 'PANIC', 'ETHRACE']]
cluster_data = cluster_data.dropna()
# split data into train and test sets
clus_train, clus_test = train_test_split(cluster_data, test_size=.3, random_state=123)
# k-means cluster analysis for 1-9 clusters
from scipy.spatial.distance import cdist
clusters=range(1,10)
meandist=[]
for k in clusters:
model=KMeans(n_clusters=k)
model.fit(clus_train)
clusassign=model.predict(clus_train)
meandist.append(sum(numpy.min(cdist(clus_train, model.cluster_centers_, 'euclidean'), axis=1))
/ clus_train.shape[0])
"""
Plot average distance from observations from the cluster centroid
to use the Elbow Method to identify number of clusters to choose
"""
plt.plot(clusters, meandist)
plt.xlabel('Number of clusters')
plt.ylabel('Average distance')
plt.title('Selecting k with the Elbow Method')
plt.show()
# Interpret 3 cluster solution
model3=KMeans(n_clusters=3)
model3.fit(clus_train)
clusassign=model3.predict(clus_train)
# plot clusters
from sklearn.decomposition import PCA
pca_2 = PCA(2)
plot_columns = pca_2.fit_transform(clus_train)
plt.scatter(x=plot_columns[:,0], y=plot_columns[:,1], c=model3.labels_,)
plt.xlabel('Canonical variable 1')
plt.ylabel('Canonical variable 2')
plt.title('Scatterplot of Canonical Variables for 3 Clusters')
plt.show()
"""
BEGIN multiple steps to merge cluster assignment with clustering variables to examine
cluster variable means by cluster
"""
# create a unique identifier variable from the index for the
# cluster training data to merge with the cluster assignment variable
clus_train.reset_index(level=0, inplace=True)
# create a list that has the new index variable
cluslist=list(clus_train['index'])
# create a list of cluster assignments
labels=list(model3.labels_)
# combine index variable list with cluster assignment list into a dictionary
newlist=dict(zip(cluslist, labels))
newlist
# convert newlist dictionary to a dataframe
newclus=DataFrame.from_dict(newlist, orient='index')
newclus
# rename the cluster assignment column
newclus.columns = ['cluster']
# now do the same for the cluster assignment variable
# create a unique identifier variable from the index for the
# cluster assignment dataframe
# to merge with cluster training data
newclus.reset_index(level=0, inplace=True)
# merge the cluster assignment dataframe with the cluster training variable dataframe
# by the index variable
merged_train=pandas.merge(clus_train, newclus, on='index')
merged_train.head(n=100)
# cluster frequencies
merged_train.cluster.value_counts()
"""
END multiple steps to merge cluster assignment with clustering variables to examine
cluster variable means by cluster
"""
# FINALLY calculate clustering variable means by cluster
clustergrp = merged_train.groupby('cluster').mean()
print ("Clustering variable means by cluster")
print(clustergrp)
# validate clusters in training data by examining cluster differences in anxiety using ANOVA
# first have to merge GENERAL ANXIETY with clustering variables and cluster assignment data
consumer_data= data['S9Q1A']
# split ANXIETY data into train and test sets
anxiety_train, anxiety_test = train_test_split(consumer_data, test_size=.3, random_state=123)
anxiety_train1=pandas.DataFrame(anxiety_train)
anxiety_train1.reset_index(level=0, inplace=True)
merged_train_all=pandas.merge(anxiety_train1, merged_train, on='index', suffixes=['', '_'])
sub1 = merged_train_all[['S9Q1A', 'cluster']].dropna()
anxietymod = smf.ols(formula='S9Q1A ~ C(cluster)', data=sub1).fit()
print (anxietymod.summary())
print ('means for CONSUMER by cluster')
m1= sub1.groupby('cluster').mean()
print (m1)
print ('standard deviations for CONSUMER by cluster')
m2= sub1.groupby('cluster').std()
print (m2)
mc1 = multi.MultiComparison(sub1['S9Q1A'], sub1['cluster'])
res1 = mc1.tukeyhsd()
print(res1.summary())











Comments
Post a Comment