Load source code
Read data
# folder containing snp and otu data
ddata = 'data/'
# read bacteria
B = read.biome(paste0(ddata,'journal.pgen.1007580.s005.txt.gz'))
# read archaea (not used)
A = read.biome(paste0(ddata,'journal.pgen.1007580.s006.txt.gz'))
# read genotypes
X = read.gen(paste0(ddata,'gen.txt.gz'))
# N SNPs
Nsnp = nrow(X)
# N individuals
Nind = ncol(X)
# N OTUs
Notu = nrow(B)
try (if (Nind != ncol(B)) stop('Nind in B and X does not match'))
print(c(Nsnp, Notu, Nind))
## [1] 32204 4018 750
Main parameters (check main help)
h2 = 0.25
b2 = 0.25
Nqtl_y = 100
Notu_y = 25
Notu_y_g = 25
Nqtl_otu = 10
Nclust = 500
Nmiss = 75
Clustering (optional)
Cl = hclust(dist(B),method="ward.D2")
Bclust = cutree(Cl,Nclust)
Simulate data
# If no clustering, Bclust=NULL
s = SimuBiome(X, B, Bclust=Bclust, h2=h2, b2=b2, Nqtl_y=Nqtl_y, Notu_y=Notu_y, Notu_y_g=Notu_y_g)
# data saving
save(s,file='simubiome.Rdata')
print(names(s))
## [1] "y" "B" "gq" "gb" "b_otu"
## [6] "b_qtl" "qtl_list" "otu_list" "otu_qtl_list"
Output data
# simulated phenotype
y = s$y
hist(y, main='Simulated phenotypes')

# returns reordered B, always in log scale,
# B is shuffled in every call to SimuBiome
B = s$B
#--> some plots
par(mfrow=c(2,2))
hist(s$gq, main='Individual Genetic values')
hist(s$gb, main='Individual Microbiome values')
hist(abs(s$b_qtl), main='Genetic coefficients (abs values)')
hist(abs(s$b_otu), main='Microbiome coefficients (abs values)')

print(c('Nqtl ',length(s$b_qtl)))
## [1] "Nqtl " "100"
print(c('Notu causative ',length(s$b_otu)))
## [1] "Notu causative " "25"
Data preparation
#--> scale and transpose data
X = scale(t(X))
B = scale(t(B))
y = scale(y)
#--> prediction
tst = sample(seq(length(y)),size=Nmiss)
yNA = y
yNA[tst] = NA
Running Bayes C
WARNING: produces huge files and can take long
# These are prob priors for a variable to enter into the model
probin=0.001
p0=5
# Model with G and B, only 10k iterations for example, try at least 40k
fm_Cgb = doBayesC(yNA, X=X, B=B, out='bayCgb_', pi1=probin, pi2=probin, p0=p0, nIter=1e4)
# Only G
# fm_Cg = doBayesC(yNA, X=X, out='bayCg_', pi1=probin, p0=p0)
# Only B
# fm_Cb = doBayesC(yNA, B=B, out='bayCb_', pi2=probin, p0=p0)
# c = list('fm_Cgb'=fm_Cgb,'fm_Cg'=fm_Cg,'fm_Cb'=fm_Cb)
# save(c,file='fmc.Rdata')
Predictive accuracy
# Predicted y Bayes Cgb
yhat = fm_Cgb$fm$yHat[tst]
# plot y vs. yhat
plot(y[tst], yhat, xlab='Observed y', ylab='Predicted y')

rho = cor(y[tst], yhat)
print(rho)
## [1] 0.5247573
Estimated h2, b2, cov(g,b)
# This is matrix implementation of
# https://github.com/gdlc/BGLR-R/blob/master/inst/md/heritability.md
# (second method)
G=readBinMat('bayCgb_ETA_1_b.bin')
C=readBinMat('bayCgb_ETA_2_b.bin')
u=G%*%t(X)
b=C%*%t(B)
Y=matrix(rep(y,nrow(G)), nrow = nrow(G), byrow=TRUE)
varU=apply(u,1,var)
varB=apply(b,1,var)
covUB=(apply(u+b,1,var) - varU - varB)*0.5
varE=apply(Y-u-b,1,var)
h2g=varU/(varU+varB+varE)
h2b=varB/(varU+varB+varE)
h2gb=covUB/(varU+varB+varE)
# plots
plot(density(h2g),xlim=c(-0.05,1),main=c('Bayes Cgb',rho),xlab='var component')
lines(density(h2b),col='blue',lty=2)
lines(density(h2gb),col='red',lty=3)

print(c(mean(h2g),mean(h2b),mean(h2gb)))
## [1] 0.2690665 0.2104698 -0.0139672