/*---------------------STRUCTDPM.c -----------------------------------/ /* Implements Hierarchical Dirichlet Process Mixture Model /* for Genetic Association Studies of Quantitative Traits in the /* Presence of Population Stratification /* Model described in: /* NM Pajewski and PW Laud. A flexible Bayesian Semiparametric /* Model for Association Studies of Quantitative Traits in the /* Presence of Population Structure (2008) submitted. /* /* Author: Nicholas M. Pajewski /* Updated: 3/24/2008 /* Documentation file: STRUCTDPM_readme.pdf /* For questions or to report bugs, please contact npajewsk@mcw.edu /*------------------------------------------------------------------*/ #include #include #include #include #include #include #include #include #include #include "/home/npajewsk/Dissertation/cc_DPM/arms.h" #include "/home/npajewsk/Dissertation/cc_DPM/arms.c" #include #include /*------------------------------------------------------------*/ /* Data Structure */ /*------------------------------------------------------------*/ typedef struct genetic_data{ int num_parm; int finite_limit; /* Limit for approximation to Dirichlet Process */ double q[51]; /* Vector of stick-breaking weights for H */ double a_q[51]; /* parameters of stick-breaking measure */ double b_q[51]; /* parameters of stick-breaking measure */ double theta[51][703]; /* Collection of logit allele frequencies */ double SNP_prior[2]; /* Prior parameters on logit allele frequencies */ double clust_allele_freq[51][703][2]; /* Counts het/homo at l for each cluster */ int samp_size; /* Total number of subjects */ int total_loci; /* Total number of loci */ int genotypes[900][703][2]; /* Observed Genotype Data */ double gen_profile[900]; /* Sum of genetic effects for each individual */ double phenotype[900]; /* Observed Continuous Phenotypes */ double beta_ge[51][2]; /* Genetic Effects at each locus */ double beta0[51]; /* Intercept in each cluster */ double tau_epsilon; /* Error Precision: Common to each cluster */ double mu_b0; /* Prior mean for intercept */ double tau_b0; /* Prior precision for intercept */ double pi; /* weight on zero cluster in DP process */ double alpha_s; /* Prior parameters for Pi */ double beta_s; double alpha_tau; /* Prior for phenotypic precision */ double lambda_tau; /* Prior for phenotypic precision */ int distinctH[51]; int MH_statistics[2]; }GENETIC_DATA; /*--------------------------------------------------------------------------------------*/ /* Structure to hold necessary quantities to update marker allele frequencies /* using Adaptive Rejection Sampling (ARS) /*--------------------------------------------------------------------------------------*/ typedef struct theta_arms{ int n_j; /* Number of Subjects in Cluster */ int homo_sum; /* # of Individuals Homozygous at Locus in Cluster */ int hetero_sum; /* # of Heterozygous Individuals at Locus in Cluster */ double mu_theta, tau_theta; /* Normal Prior Parameters for Allele Frequency */ }THETA_ARMS; /*--------------------------------------------------------------------------------------*/ /* Function for ARMS to evaluate theta[j]'s unnormalized log-density for /* candidate and marker positions /*--------------------------------------------------------------------------------------*/ double theta_density(double x, void *data){ struct theta_arms *d; double value; d=data; value=x*(2.0*(d->homo_sum)+(d->hetero_sum))-2.0*(d->n_j)*log(1.0+exp(x)) -(d->tau_theta/2.0)*pow(x-(d->mu_theta),2.0); return(value); } /*-------------------------------------------------------------------------------------*/ /* Function to Sample from baseline measure G_0: Normal Priors on SNP Allele frequencies /*-------------------------------------------------------------------------------------*/ void baseline_G0(gsl_rng *r, int j, void *data){ struct genetic_data *d; int l; double mu, sigma; d=data; mu=d->SNP_prior[0]; sigma=1.0/(sqrt(d->SNP_prior[1])); /************ Step 1: Draw logit SNP allele frequencies ************/ for(l=0;ltotal_loci;l++){ d->theta[j][l]=gsl_ran_gaussian(r,sigma)+mu; } /************ Step 2: Draw BETA0j **********************/ sigma=1.0/sqrt(d->tau_b0); d->beta0[j]=gsl_ran_gaussian(r,sigma)+(d->mu_b0); } /*--------------------------------------------------------------------------------*/ /* Function to evaluate log-likelihood for G /*--------------------------------------------------------------------------------*/ double log_likelihood_G(int i, int j, void *data){ struct genetic_data *d; int l; double diff; double log_like=0.0; d=data; /********** Genetic Marker Contribution *********************/ for(l=0;ltotal_loci;l++){ log_like+=(d->genotypes[i][l][0])*log(2.0) +(d->theta[j][l])*(2.0*(d->genotypes[i][l][1])+(d->genotypes[i][l][0])) -2.0*log(1.0+exp(d->theta[j][l])); } /********** Regression Contribution ************************/ diff=d->phenotype[i]-(d->beta0[j])-(d->gen_profile[i]); log_like-=((d->tau_epsilon)/2.0)*pow(diff,2.0); return(log_like); } /*--------------------------------------------------------------------------------------*/ /* Bookkeeping function for update of configuration for G /* Involves relabeling n_phi and s such that s[i]=k /* Needed when n_phi[s[i]]=1, i.e. possibility of making empty cluster /*--------------------------------------------------------------------------------------*/ void relabel(int i, int k, int N_ind, int total_loci, int n_phi[N_ind], int s[N_ind], void *data){ struct genetic_data *d; int j,n,m,p,x,y,z; double store_theta[total_loci]; int store_n_phi, store_s_i; int store_hetero_sum[total_loci]; int store_homo_sum[total_loci]; double store_beta_0; d=data; if(s[i]!=(k-1)){ /* No need to relabel */ /* Store Values for Cluster s[i] */ for(x=0;xclust_allele_freq[s[i]][x][0]; store_homo_sum[x]=d->clust_allele_freq[s[i]][x][1]; store_theta[x]=d->theta[s[i]][x]; } store_s_i=s[i]; store_n_phi=n_phi[s[i]]; store_beta_0=d->beta0[s[i]]; /* Transfer Values for kth cluster to cluster s[i] */ for(y=0;yclust_allele_freq[s[i]][y][0]=d->clust_allele_freq[k-1][y][0]; d->clust_allele_freq[s[i]][y][1]=d->clust_allele_freq[k-1][y][1]; d->theta[s[i]][y]=d->theta[k-1][y]; } n_phi[s[i]]=n_phi[k-1]; d->beta0[s[i]]=d->beta0[k-1]; /* Transfer Stored Values to Cluster k */ for(z=0;zclust_allele_freq[k-1][z][0]=store_hetero_sum[z]; d->clust_allele_freq[k-1][z][1]=store_homo_sum[z]; d->theta[k-1][z]=store_theta[z]; } d->beta0[k-1]=store_beta_0; n_phi[k-1]=store_n_phi; for(j=0;j1){ n_phi[s[i]]--; c=0; move_prob=(double) n_phi[c]/(N_ind-1.0+mass_alpha); U1=gsl_ran_flat(r,0,1); do{ if(U1<=move_prob){ sample++; }else{ c++; if(cMH_statistics[0]++; if(U2<=MH_prob){ /* Accept proposed move to c */ v->MH_statistics[1]++; for(m=0;mtotal_loci;m++){ v->clust_allele_freq[s[i]][m][0]-=v->genotypes[i][m][0]; v->clust_allele_freq[s[i]][m][1]-=v->genotypes[i][m][1]; v->clust_allele_freq[c][m][0]+=v->genotypes[i][m][0]; v->clust_allele_freq[c][m][1]+=v->genotypes[i][m][1]; } s[i]=c; n_phi[s[i]]++; if(c==k){ k++; } }else{ n_phi[s[i]]++; } /******* i: Singleton in cluster s[i] **************/ }else if(n_phi[s[i]]==1){ relabel(i, k, N_ind, total_loci, n_phi, s, v); n_phi[k-1]=0; c=0; move_prob=(double) n_phi[c]/(N_ind-1.0+mass_alpha); U1=gsl_ran_flat(r,0,1); do{ if(U1<=move_prob){ sample++; }else{ c++; if(c<(k-1)){ move_prob+=n_phi[c]/(N_ind-1.0+mass_alpha); }else{ move_prob+=mass_alpha/(N_ind-1.0+mass_alpha); }} }while(sample<1); if(c==(k-1)){ baseline_G0(r, c, v); } /* Compute Metropolis Acceptance Ratio */ log_l_up=log_likelihood_G(i, c, v); log_l_low=log_likelihood_G(i,s[i],v); log_like_ratio=log_l_up-log_l_low; like_ratio=exp(log_like_ratio); if(like_ratio<1.0){ MH_prob=like_ratio; }else{ MH_prob=1.0; } U2=gsl_ran_flat(r,0,1); v->MH_statistics[0]++; if(U2<=MH_prob){ v->MH_statistics[1]++; for(m=0;mclust_allele_freq[s[i]][m][0]-=v->genotypes[i][m][0]; v->clust_allele_freq[s[i]][m][1]-=v->genotypes[i][m][1]; v->clust_allele_freq[c][m][0]+=v->genotypes[i][m][0]; v->clust_allele_freq[c][m][1]+=v->genotypes[i][m][1]; } s[i]=c; n_phi[s[i]]++; if(c<(k-1)){ k--; } }else{ n_phi[s[i]]++; } } } return(k); } /*------------------------------------------------------------*/ /* Function to track pairwise group membership to compute /* Bayes Factors ala Huelsenbeck & Andolfatto (2007) /*------------------------------------------------------------*/ void Bayes_Factor_Population(int N_ind, int s[N_ind], int pairwise_group[N_ind][N_ind]){ int i,j; for(i=0;ibeta){ d1=alpha; d2=beta; }else{ d1=beta; d2=alpha; } d3=d1+d2; d4=1.0/d2; d5=1.0+d1-d2; d6=(d5*(0.0138888889+0.0416666667*d2))/(d1*d4-0.777777778); d7=0.25+(0.5+(0.25/d5))*d2; accept=0; do{ U1=gsl_ran_flat(r,0.0,1.0); U2=gsl_ran_flat(r,0.0,1.0); fast_reject=0; /* Fast rejection step */ if(U1<0.5){ Y1=U1*U2; Y2=U1*Y1; fr_check=0.25*U2+Y2-Y1; if(fr_check>=d6){ fast_reject=1; } }else{ Y2=U1*U1*U2; if(Y2<=0.25){ accept=1; V=d4*log(U1/(1.0-U1)); W=d1*exp(V); } if(Y2>=d7){ fast_reject=1; } } if(fast_reject==0){ V=d2*log(U1/(1.0-U1)); W=d1*exp(V); ln_Y2=log(Y2); acc_check=d3*(log(d3/(d2+W))+V)-1.38629436; if(acc_check>=ln_Y2){ accept=1; } } }while(accept<1); if(alpha>beta){ Z=W/(d2+W); }else{ Z=d2/(d2+W); } return(Z); } /*--------------------------------------------------------------------------------------*/ /* Function to sample Beta0j from G /*--------------------------------------------------------------------------------------*/ void gibbs_beta0j(gsl_rng *r, void *data, int j, int N_ind, int s[N_ind], int n_phi[N_ind], gsl_matrix *B0Z){ int i,c; double phengen, cond_mean, cond_pre, cond_sigma; struct genetic_data *d; d=data; /********* Step: Simulate BETA0j ****************/ gsl_matrix *YMXBETA=gsl_matrix_calloc(n_phi[j],1); c=0; for(i=0;isamp_size;i++){ if(s[i]==j){ phengen=d->phenotype[i]-d->gen_profile[i]; gsl_matrix_set(YMXBETA,c,0,phengen); c++; } } gsl_matrix *SUM=gsl_matrix_calloc(n_phi[j],1); gsl_matrix_set_all (SUM, 1.0); gsl_matrix *YPY=gsl_matrix_calloc(1,1); gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0,YMXBETA, SUM, 0.0,YPY); cond_pre=d->tau_b0+(n_phi[j])*(d->tau_epsilon); cond_mean=((d->tau_epsilon)*(gsl_matrix_get(YPY,0,0))+(d->tau_b0)*(d->mu_b0))/cond_pre; cond_sigma=1.0/sqrt(cond_pre); /* Simulate new beta_0 */ d->beta0[j]=cond_sigma*gsl_ran_gaussian(r,1.0)+cond_mean; gsl_matrix_free(YMXBETA); gsl_matrix_free(YPY); gsl_matrix_free(SUM); /************** Step 2: Update BOZ vector *************/ for(i=0;isamp_size;i++){ if(s[i]==j){ gsl_matrix_set(B0Z,i,0,d->beta0[j]); } } } /*--------------------------------------------------------------------------------------*/ /* Function to sample Tau_epsilon /*--------------------------------------------------------------------------------------*/ void gibbs_tau_epsilon(gsl_rng *r, void *data, gsl_matrix *B0Z, gsl_matrix *Y, gsl_matrix *NZXB){ struct genetic_data *d; double alpha_star, lambda_star, beta_star; d=data; /* Matrix calculations for lambda_star */ gsl_matrix *YMXBETA=gsl_matrix_calloc(d->samp_size,1); gsl_matrix_memcpy (YMXBETA, Y); gsl_matrix_add(NZXB,B0Z); gsl_matrix_sub(YMXBETA,NZXB); gsl_matrix *YPY=gsl_matrix_calloc(1,1); gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, YMXBETA, YMXBETA, 0.0, YPY); gsl_matrix_free(YMXBETA); lambda_star=d->lambda_tau+0.5*(gsl_matrix_get(YPY,0,0)); gsl_matrix_free(YPY); alpha_star=d->alpha_tau+(d->samp_size/2.0); beta_star=1.0/lambda_star; d->tau_epsilon=beta_star*gsl_ran_gamma(r,alpha_star,1.0); } /*------------------------------------------------------------*/ /* Function to Update Stick-Breaking Weights */ /*------------------------------------------------------------*/ void stick_weight_new(gsl_rng *r, double mass_alpha, void *data, int approx_limit, int n_phi_ge[approx_limit]){ struct genetic_data *d; int j; double alpha_p, beta_p; int M_k; double left_of_stick; double alpha_pi, beta_pi; double nz_pt; double V_K; d=data; /* First update pi, probability of falling in zero effect cluster */ alpha_pi=d->alpha_s+n_phi_ge[0]; beta_pi=d->beta_s+(d->total_loci-n_phi_ge[0]); d->pi=gsl_ran_beta(r,alpha_pi,beta_pi); d->q[0]=d->pi; /* Initialize M_k */ left_of_stick=1.0; M_k=d->total_loci-n_phi_ge[0]; /* Compute stick breaking weights on log scale */ for(j=1;j<(d->finite_limit-1);j++){ nz_pt=d->finite_limit-1.0; d->a_q[j]=mass_alpha/(nz_pt); d->b_q[j]=(mass_alpha*(nz_pt-j)/nz_pt); M_k-=n_phi_ge[j]; alpha_p=n_phi_ge[j]+(d->a_q[j]); beta_p=(d->b_q[j])+M_k; if(alpha_p>=1.0 & beta_p>=1.0){ V_K=gsl_ran_beta(r,alpha_p,beta_p); }else{ V_K=cheng_ran_beta(r,alpha_p, beta_p); } d->q[j]=(1.0-d->pi)*left_of_stick*V_K; left_of_stick=left_of_stick*(1.0-V_K); } d->q[d->finite_limit-1]=(1.0-d->pi)*left_of_stick; } /*------------------------------------------------------------------------------*/ /* Function to Sample from baseline measure G_0: Bivariate Normal Prior on /* Genetic Effects /* SIGMA2=Lower Triangular Cholesky Decomposition of Covariance Matrix /*------------------------------------------------------------------------------*/ void baseline_H0( gsl_rng *r, int j, gsl_matrix *MU, gsl_matrix *SIGMA2, void *data){ struct genetic_data *d; double sigma1, sigma2, mu1, mu2; d=data; sigma1=sqrt(gsl_matrix_get(SIGMA2,0,0)); mu1=gsl_matrix_get(MU,0,0); sigma2=sqrt(gsl_matrix_get(SIGMA2,1,1)); mu2=gsl_matrix_get(MU,1,0); d->beta_ge[j][0]=sigma1*(gsl_ran_gaussian(r,1.0))+mu1; d->beta_ge[j][1]=sigma2*(gsl_ran_gaussian(r,1.0))+mu2; } /*--------------------------------------------------------------------------------------*/ /* Function to sample alpha (Mass Parameter) for Dirichlet Process on Genetic /* effects - From West (1992) /*--------------------------------------------------------------------------------------*/ double mass_samp(gsl_rng *r, int Num, double alpha, int k, double a_0, double b_0, void *data){ struct genetic_data *d; double x; double alpha_pr=alpha+1; double mix_prob; double pi_x, beta_new; int alpha_new; d=data; x=gsl_ran_beta (r, alpha_pr, Num); /* Step 1 */ mix_prob=gsl_ran_flat(r,0,1); /* Step II */ pi_x=(a_0+k-1)/(Num*(b_0-log(x))+(a_0+k-1)); /**CHECK THIS FORMULA ****/ if(mix_prob<=pi_x){ alpha_new=a_0+k; beta_new=b_0-log(x); alpha=gsl_ran_gamma (r, alpha_new, beta_new); return(alpha); }else{ alpha_new=a_0+k-1; beta_new=b_0-log(x); alpha=gsl_ran_gamma (r, alpha_new, beta_new); return(alpha); } } /*------------------------------------------------------------*/ /* Function to evaluate log-likelihood function for H */ /* Unnormalized normal density function /*------------------------------------------------------------*/ double log_likelihood(int l, int j, gsl_matrix *Y, gsl_matrix *XBETA, gsl_matrix *NZXB, gsl_matrix *B0Z, void *data){ struct genetic_data *v; double log_like; int n; v=data; /* Matrix Calculations to Compute Likelihood */ gsl_matrix *YMXBETA=gsl_matrix_calloc(v->samp_size,1); gsl_matrix_memcpy (YMXBETA, Y); gsl_matrix_sub(YMXBETA,XBETA); gsl_matrix_sub(YMXBETA,B0Z); gsl_matrix_sub(YMXBETA,NZXB); gsl_matrix *LIKE=gsl_matrix_calloc(1,1); gsl_blas_dgemm(CblasTrans,CblasNoTrans, 1.0, YMXBETA, YMXBETA, 0.0, LIKE); gsl_matrix_free(YMXBETA); gsl_matrix_scale(LIKE,v->tau_epsilon); log_like=-(0.5)*gsl_matrix_get(LIKE,0,0); gsl_matrix_free(LIKE); return(log_like); } /*------------------------------------------------------------*/ /* Function to sample s_i using Blocked Gibbs Update of */ /* Ishwaran and James (JASA 2001) */ /*------------------------------------------------------------*/ int blocked_gibbs_config(gsl_rng *r, int l, int approx_limit, void *data, gsl_matrix *Y, gsl_matrix *B0Z, int num_loci, int configuration[approx_limit][num_loci], int z[num_loci], int index[num_loci], int n_phi_ge[approx_limit]){ struct genetic_data *d; int j,h,b,n,f,a,v; int sample; double atom_probs[approx_limit]; double weight_probs[approx_limit]; double norm_constant=0.0; double max_ll; double pt_mass_prob; double U; int locus; int temp_s; double l_effect; double temp_gen; d=data; gsl_matrix *XBETA=gsl_matrix_calloc(d->samp_size,1); gsl_matrix *NZXB=gsl_matrix_calloc(d->samp_size,1); /* Subtract current locus effect out of gen_profile */ if(z[l]!=0){ /* Current loci had an effect in previous iteration */ for(n=0;nsamp_size;n++){ temp_gen=d->gen_profile[n]; d->gen_profile[n]=temp_gen-(d->beta_ge[z[l]][0])*(d->genotypes[n][l][0])- (d->beta_ge[z[l]][1])*(d->genotypes[n][l][1]); gsl_matrix_set(NZXB,n,0,d->gen_profile[n]); } }else{ for(n=0;nsamp_size;n++){ gsl_matrix_set(NZXB,n,0,d->gen_profile[n]); } } /* Compute total probability to get normalization constant based on likelihood and stick-breaking weights. Involves computing things on log-likelihood scale and shifting to get numerical stability */ for(j=0;jsamp_size;v++){ l_effect=(d->beta_ge[j][0])*(d->genotypes[v][l][0]) +(d->beta_ge[j][1])*(d->genotypes[v][l][1]); gsl_matrix_set(XBETA,v,0,l_effect); } atom_probs[j]=log(d->q[j])+log_likelihood(l, j, Y, XBETA, NZXB, B0Z,d); if(j==0){ max_ll=atom_probs[j]; } if(atom_probs[j]>max_ll){ max_ll=atom_probs[j]; } } gsl_matrix_free(XBETA); gsl_matrix_free(NZXB); for(b=0;bsamp_size;n++){ temp_gen=d->gen_profile[n]; d->gen_profile[n]=temp_gen+(d->beta_ge[temp_s][0])*(d->genotypes[n][l][0])+ (d->beta_ge[temp_s][1])*(d->genotypes[n][l][1]); } } return(temp_s); } /*--------------------------------------------------------------------------------*/ /* DP Parameters: Outputs three quantities at each locus /* I.) Indicator of whether locus effect is non-zero (File=ge_zero_post) /* II.) Estimated Genetic effect (File=ge_post) /* III.) Predictive sample of logit allele frequency (File=out_allele) /*--------------------------------------------------------------------------------*/ void post_pred(gsl_rng *r, FILE *ge_zero_post, FILE *ge_post, FILE *out_allele, void *data, int K_G, double massG, int K_H, double massH, int g, int burn_in, int num_loci, int N_ind, int z[num_loci], int s[N_ind], int n_phi[N_ind], int H_switch){ int l,j,q,h=0,p,m; int zero_ind, sample, atom; double U; double prob_atom; double tot_prob=0.0; double choice_prob, sum_prob; double theta_tilda_g[num_loci]; int done=0; double beta_tilda_g; struct genetic_data *d; d=data; if(g>burn_in){ /* Output quantities related to G: DP on allele frequencies */ for(j=0;jtheta[h][l]; } beta_tilda_g=d->beta0[h]; }else if(h<(K_G-1)){ h++; sum_prob=sum_prob+(n_phi[h]/tot_prob); }else{ baseline_G0(r, K_G, d); for(l=0;ltheta[K_G][l]; } beta_tilda_g=d->beta0[K_G]; done++; } }while(done<1); /* Given chosen atom: Output allele frequencies to file:out_allele */ for(l=0;ltotal_loci;l++){ fprintf(out_allele,"%d %d %lf\n",g,l,theta_tilda_g[l]); } if(H_switch==1){ /* Output Quantities for Genetic Effects if past burn_in */ for(l=0;ltotal_loci;l++){ if(z[l]==0){ zero_ind=0; }else{ zero_ind=1; } fprintf(ge_zero_post,"%d %d %d\n",g,l,zero_ind); fprintf(ge_post,"%d %d %d %lf\n",g,l,1,d->beta_ge[z[l]][0]); fprintf(ge_post,"%d %d %d %lf\n",g,l,2,d->beta_ge[z[l]][1]); } } } } /*--------------------------------------------------------------------------------*/ /* Function To Initialize Gibbs Sequence /*--------------------------------------------------------------------------------*/ void initial_samp(gsl_rng *r, void *data, double massG, double massH, int approx_limit, int num_loci, int configuration[approx_limit][num_loci], int z_ind[num_loci], int index[num_loci], int n_phi_ge[approx_limit], int N_ind, int s[N_ind], int n_phi[approx_limit], int init_clus, gsl_matrix *B0Z){ int i,l,j; double ran_num; double dummy; double left_of_stick=1.0; double M_k; double alpha_p, beta_p; int temp, clus; double temp2; int nz_pt; double V_K; struct genetic_data *d; d=data; /************ Initialize parameters related to H **********************/ d->tau_epsilon=0.01; /* Randomly assign each loci to initial zero and non-zero clusters */ d->pi=0.98; for(i=0;itotal_loci;i++){ z_ind[i]=0; temp=n_phi_ge[z_ind[i]]; configuration[z_ind[i]][temp]=i; index[i]=n_phi_ge[0]; n_phi_ge[z_ind[i]]++; } for(i=0;isamp_size;i++){ d->gen_profile[i]=0.0; } d->q[0]=d->pi; /* Initialize M_k */ left_of_stick=1.0; M_k=d->total_loci-(n_phi_ge[0]); for(j=1;j<(d->finite_limit-1);j++){ nz_pt=d->finite_limit-1; d->a_q[j]=massH/(nz_pt); d->b_q[j]=(massH*(nz_pt-j))/(nz_pt); alpha_p=d->a_q[j]; beta_p=d->b_q[j]; d->q[j]=(1.0-d->pi)*left_of_stick*(cheng_ran_beta(r,alpha_p,beta_p)); left_of_stick=left_of_stick*(1.0-(d->q[j])); } d->q[d->finite_limit-1]=left_of_stick*(1.0-d->pi); /************** Initialize parameters related to G *******************/ /* Initialize configuration */ for(i=0;isamp_size;i++){ temp2=(init_clus)*gsl_ran_flat(r,0.0,1.0); clus=(int) temp2; s[i]=clus; n_phi[s[i]]++; for(l=0;ltotal_loci;l++){ d->clust_allele_freq[clus][l][0]+=d->genotypes[i][l][0]; d->clust_allele_freq[clus][l][1]+=d->genotypes[i][l][1]; } /* Setup Beta0j matrix */ gsl_matrix_set(B0Z,i,0,d->beta0[clus]); } } /*--------------------------------------------------------------------------------*/ /* Function to perform update of genetic effects /* Bivariate normal update of beta1j and beta2j /*--------------------------------------------------------------------------------*/ void gibbs_genetic(gsl_rng *r, int j, void *data, gsl_matrix *Y, gsl_matrix *TPHI, gsl_matrix *T, gsl_matrix *B0Z, int approx_limit, int num_loci, int configuration[approx_limit][num_loci], int n_phi_ge[approx_limit]){ struct genetic_data *d; int f,l,c,a,m,v,n; int counter2; double fuzz=0.0001; double temp; double temp_gen; int het_sum, homo_sum; d=data; /* Step 1: Create data vectors for clusters of genetic effects */ gsl_matrix *X=gsl_matrix_calloc(d->samp_size,2); gsl_matrix *ZBETA=gsl_matrix_calloc(d->samp_size,1); gsl_matrix *TEMPY=gsl_matrix_calloc(d->samp_size,1); counter2=0; /* Setup data matrix for current cluster and other genetic effects */ for(a=0;asamp_size;a++){ het_sum=0; homo_sum=0; for(v=0;vgenotypes[a][configuration[j][v]][0]; homo_sum+=d->genotypes[a][configuration[j][v]][1]; } temp_gen=d->gen_profile[a]-((d->beta_ge[j][0])*het_sum+(d->beta_ge[j][1])*homo_sum); gsl_matrix_set(ZBETA,a,0,temp_gen); d->gen_profile[a]=temp_gen; gsl_matrix_set(X,a,0,het_sum); gsl_matrix_set(X,a,1,homo_sum); } /****** Matrix Calculations for Precision Matrix **********/ gsl_matrix *XPTX=gsl_matrix_calloc(2,2); gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, X, X, 0.0, XPTX); gsl_matrix_scale(XPTX,d->tau_epsilon); gsl_matrix_add(XPTX,T); gsl_matrix *SIGMA=gsl_matrix_calloc(d->num_parm,d->num_parm); matrix_inv_svd(XPTX,SIGMA,d->num_parm,fuzz); /********** Computing Mean Matrix ********************/ gsl_matrix *XPY=gsl_matrix_calloc(d->num_parm,1); gsl_matrix_memcpy(TEMPY,Y); gsl_matrix_sub(TEMPY,ZBETA); gsl_matrix_sub(TEMPY,B0Z); gsl_matrix_free(ZBETA); gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, X, TEMPY, 0.0, XPY); gsl_matrix_free(X); gsl_matrix_free(TEMPY); gsl_matrix *PHISTAR=gsl_matrix_calloc(d->num_parm,1); gsl_matrix_scale(XPY,d->tau_epsilon); gsl_matrix_add(XPY,TPHI); gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, SIGMA, XPY, 0.0, PHISTAR); gsl_matrix_free(XPY); gsl_matrix_free(XPTX); /* Simulate Independent Vector of Normal Deviates */ gsl_matrix *NORM=gsl_matrix_calloc(d->num_parm,1); for(f=0;fnum_parm;f++){ temp=gsl_ran_gaussian(r,1.0); gsl_matrix_set(NORM,f,0,temp); } /* Compute Cholesky Decomposition of Sigma */ gsl_linalg_cholesky_decomp(SIGMA); /* Create Lower Triangular Matrix */ for(c=0;cnum_parm;c++){ for(a=c+1;anum_parm;a++){ gsl_matrix_set(SIGMA,c,a,0.0); } } /* Transform Normal Deviates to get desired covariance structure */ gsl_matrix *NORM2=gsl_matrix_calloc(d->num_parm,1); gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, SIGMA, NORM, 0.0, NORM2); gsl_matrix_add(NORM2,PHISTAR); gsl_matrix_free(PHISTAR); gsl_matrix_free(NORM); gsl_matrix_free(SIGMA); /* Send sampled genetic effects to data structure */ for(c=0;cnum_parm;c++){ d->beta_ge[j][c]=gsl_matrix_get(NORM2,c,0); } gsl_matrix_free(NORM2); /* Update genetic profile */ for(m=0;msamp_size;m++){ temp_gen=d->gen_profile[m]; for(l=0;lbeta_ge[j][0])*(d->genotypes[m][configuration[j][l]][0])+ (d->beta_ge[j][1])*(d->genotypes[m][configuration[j][l]][1]); } d->gen_profile[m]=temp_gen; } } /*--------------------------------------------------------------------------------*/ /* Main Function /*--------------------------------------------------------------------------------*/ int main(){ int a,n,f,l,j,w,p,v,d,e,q,h,u,b,y,x,c,i,g; FILE *obs_data, *ge_zero_post, *ge_post, *G0_prior, *pheno_data, *prob_pop, *out_allele, *DPMtimer, *parms; const gsl_rng_type *T; gsl_rng *r; int N_ind, num_loci, H_switch=1, init_clus, repeat, K_G, K_H, gibbs, burn_in; int approx_limit, every_iter, dum1, dum2, locus_count, dummy_switch; /* Timing declarations */ struct tms tbuff; clock_t start,end; double elapsed, massG=1.0, massH=1.0, a_0G, b_0G, a_0H, b_0H, fuzz=0.0001, gen_profile, prior_ge[2][2]; int temp_s, old_s, end_locus, locus , n_new, n_old, old_index; struct genetic_data gd1; struct theta_arms tha; unsigned int seed; int iarr[2]; double darr[2]; /*-----------------------------------------------------------------------------*/ /* The following are the necessary paths for input and output files /*-----------------------------------------------------------------------------*/ pheno_data=fopen("phenotype.txt","r"); /* Phenotype data file */ obs_data=fopen("genome.txt","r"); /* Genotypes data file */ parms=fopen("parm_setup.txt","r"); /* Parameter Setup file */ ge_zero_post=fopen("post_gezero.txt","w+"); /* Posterior output file for indicators of null cluster */ ge_post=fopen("post_ge.txt","w+"); /* Posterior genetic effects samples */ out_allele=fopen("out_allele.txt","w+"); /* Predictive samples for allele frequencies */ srand(seed); /*-------------------------------------------------------------------------------*/ /* Read in parameter setup file /*-------------------------------------------------------------------------------*/ fscanf(parms,"%d\n",&iarr[0]); N_ind=iarr[0]; fscanf(parms,"%d\n",&iarr[0]); num_loci=iarr[0]; fscanf(parms,"%d\n",&iarr[0]); gibbs=iarr[0]; fscanf(parms,"%d\n",&iarr[0]); burn_in=iarr[0]; fscanf(parms,"%d\n",&iarr[0]); repeat=iarr[0]; fscanf(parms,"%d\n",&iarr[0]); approx_limit=iarr[0]; fscanf(parms,"%d\n",&iarr[0]); init_clus=iarr[0]; fscanf(parms,"%d\n",&iarr[0]); seed=iarr[0]; fscanf(parms,"%lf ",&darr[0]); a_0G=darr[0]; fscanf(parms,"%lf ",&darr[0]); b_0G=darr[0]; fscanf(parms,"%lf ",&darr[0]); a_0H=darr[0]; fscanf(parms,"%lf\n",&darr[0]); b_0H=darr[0]; fscanf(parms,"%lf %lf\n",&gd1.mu_b0,&gd1.tau_b0); fscanf(parms,"%lf %lf\n",&gd1.alpha_tau,&gd1.lambda_tau); fscanf(parms,"%lf %lf\n",&gd1.alpha_s,&gd1.beta_s); fscanf(parms,"%d\n",&iarr[0]); every_iter=iarr[0]; fscanf(parms,"%lf %lf %lf %lf\n",&prior_ge[0][0],&prior_ge[0][1],&prior_ge[1][0],&prior_ge[1][1]); printf("STRUCTDPMv1.0: Bayesian Nonparametric Inference for Association \n Studies of Quantitative Traits in \n the presence of Population Structure \n \n Author: Nicholas M Pajewski (npajewsk at mcw edu)\n \n"); printf("Analysis parameters\n\n"); printf("Number of individuals=%d\nNumber of SNPs= %d\n",N_ind,num_loci); printf("Number of MCMC iterations= %d\n",gibbs); printf("Burn-in= %d\n",burn_in); printf("Number of subiterations for Neal's (2000) DP update= %d\n",repeat); printf("Finite limit from Ishwaran and James (2001)= %d\n",approx_limit); printf("Initial Clusters= %d\n",init_clus); printf("Random Number Seed= %d\n", seed); printf("Prior parameters for DP mass parameters: %lf %lf %lf %lf\n",a_0G, b_0G, a_0H, b_0H); printf("Prior parameters for beta0 in G0: %lf %lf\n", gd1.mu_b0, gd1.tau_b0); printf("Prior parameters for error precision: %lf %lf\n", gd1.alpha_tau, gd1.lambda_tau); printf("Output every %d iterations\n",every_iter); fclose(parms); /* Configuration storage for H: DP on Betas */ int z_ind[num_loci]; int index[num_loci]; int configuration[approx_limit][num_loci]; int n_phi_ge[approx_limit]; /* Configuration storage for G: DP on thetas */ int s[N_ind]; int n_phi[N_ind]; int pairwise_group[N_ind][N_ind]; /*----------------------------------------------------------------------------------*/ /* Initialize parameters in genetic data struct /*----------------------------------------------------------------------------------*/ gd1.finite_limit=approx_limit; gd1.samp_size=N_ind; gd1.total_loci=num_loci; gd1.num_parm=2; gd1.SNP_prior[0]=0.0; gd1.SNP_prior[1]=0.04614; gd1.MH_statistics[0]=0; gd1.MH_statistics[1]=0; tha.mu_theta=0.0; tha.tau_theta=0.04614; /*--------------------------------------------------------------------------------*/ /* ARMS Parameters /*--------------------------------------------------------------------------------*/ int err; /* Returns non-zero value if ARMS generator fails */ int ninit=4; /* # of points with which to make initial envelope */ int npoint=50; /* Maximum # of envelope points */ int nsamp=1; /* # of needed samples from target density */ int ncent=0; /* # of centile points to compute */ int neval=100; /* Maximum # of evaluations */ double qcent[10]={.05,.30,.70,.95}; /* Stores centiles */ int dometrop=0; double convex=1.0; double xl_theta=-50.0, xr_theta=50.0; double xsamp_theta[30]; double xsamp_cand[30]; double theta_init[10]={-10.0,-1.0,1.0,10.0}; double xcent_theta[10]; double xprev_theta[N_ind][num_loci]; double theta_prev; /*----------------------------------------------------------------------------------*/ /* Setup gsl matrices for phenotype/genotype data, Beta, Beta_0 Vector, /* and Prior on genetic effects /*----------------------------------------------------------------------------------*/ gsl_matrix *Y=gsl_matrix_calloc(gd1.samp_size,1); gsl_matrix *B0Z=gsl_matrix_calloc(gd1.samp_size,1); gsl_matrix *PRE=gsl_matrix_calloc(2,2); gsl_matrix *PMEAN=gsl_matrix_calloc(2,1); gsl_matrix *SIGMA=gsl_matrix_calloc(2,2); gsl_matrix *TPHI=gsl_matrix_calloc(2,1); gsl_matrix *GENP=gsl_matrix_calloc(gd1.samp_size,1); /*----------------------------------------------------------------------------------*/ /* Setup matrices for parameters of G_0 /* Read in parameters from file: /*----------------------------------------------------------------------------------*/ gsl_matrix_set_all(PRE,0.0); gsl_matrix_set_all(SIGMA,0.0); for(w=0;w0){ printf("error code= %d\n",err); exit(1); } /* Pull Sampled Value from ARMS Output */ gd1.theta[j][l]=xsamp_theta[0]; xprev_theta[j][l]=theta_prev; } /************ Draw Beta0_j **************/ gibbs_beta0j(r, &gd1, j, N_ind, s, n_phi, B0Z); } /************** Step 4: Update error variance *****************************/ for(n=0;nburn_in){ Bayes_Factor_Population(N_ind, s, pairwise_group); } /*------------------------------------------------------------------------------*/ /* Update H Configuration using Blocked Gibbs Sampler of Ishwaran and James (2001) /*------------------------------------------------------------------------------*/ for(j=1;j0){ gd1.distinctH[a]=1; K_H++; }else{ gd1.distinctH[a]=0; } } /*----------------------------------------------------------------------------------*/ /* Update Stick-Breaking Weights /*----------------------------------------------------------------------------------*/ stick_weight_new(r, massH, &gd1, approx_limit, n_phi_ge); /*----------------------------------------------------------------------------------*/ /* Update mass parameters using West's algorithm (1992) /*----------------------------------------------------------------------------------*/ massG=mass_samp(r, N_ind, massG, K_G, a_0G, b_0G, &gd1); massH=mass_samp(r, num_loci, massH, K_H, a_0H, b_0H, &gd1); /*----------------------------------------------------------------------------------*/ /* Posterior Predictive for Allele Frequencies & /* Output /*----------------------------------------------------------------------------------*/ post_pred(r, ge_zero_post, ge_post, out_allele, &gd1, K_G, massG, K_H, massH, g, burn_in, num_loci, N_ind, z_ind, s, n_phi, H_switch); /* Output iteration values to track Gibbs Loop progress */ if(g%every_iter==0){ printf("Iteration #: %d Clusters in G: %d MassG=%11.5f Clusters in H: %d MassH:%11.5f\n",g, K_G, massG, K_H, massH); } } fclose(ge_zero_post); fclose(ge_post); fclose(out_allele); /* Outputting program times */ end=times(&tbuff); DPMtimer=fopen("DPMtimer.txt","w+"); fprintf(DPMtimer,"real=%4.1lf user= %4.1lf system=%4.1lf\n", ((double)(end-start))/CLK_TCK, ((double) tbuff.tms_utime)/CLK_TCK, ((double) tbuff.tms_stime)/CLK_TCK); fclose(DPMtimer); /* Output Clustering Probabilities / */ prob_pop=fopen("cluster_probs.txt","w+"); Bayes_Factor_Output(prob_pop, gibbs, burn_in, N_ind, massG, pairwise_group); fclose(prob_pop); return 0; }