Biostatistics with R programming

Set knit directory

1
knitr::opts_chunk$set(root.dir = "/Users/doublefire_chen/Downloads")

Set work directory

1
2
3
4
5
## CHANGE THIS TO YOUR OWN PATH!
setwd("/Users/doublefire_chen/Downloads")

### Check the working Directory using getwd(). It does not need any arguments.
getwd()

Load dataset from csv file

1
d <- read.delim2("./data_monday-3.csv", sep = ";", dec = ",")

The essential R commands for understanding your dataset

1
2
3
4
5
6
7
8
9
10
names(d)        # column, or variables names
head(d) # The first rows of the dataframe
head(d, n = 1) # The first n rows of the dataframe
tail(d) # The last rows of the dataframe
tail(d, n = 1) # The last n rows of the dataframe
nrow(d) # The number of rows
ncol(d) # The number of columns
dim(d) # The nxm dimensions of the dataframe
class(d) # The class of variable
summary(d) # summary statistics for d

Basic calculation and function

1
2
3
4
5
6
7
8
9
mean(d$weight) # calculates the mean of a vector
sd(d$bio1) # the standard deviation
nrow(d) # number of row
range(d$weight) # gives the minimum and maximum values of a vector
any(d$weight > 90)
all(d$weight < 90)
sum(d$weight > 60 & d$weight < 80) # calculate how many individuals have weight between 60 and 80?
View(d) # view dataset
unique(d$dose)

Histogram

1
hist(d$weight, breaks = 12, col = "darkorange", border = "darkblue", main = "Collected Weight Measurments", xlab = "Weight (kg)", ylab = "Freq", density=30, cex.axis = 1.3, cex.main = 2, labels = T, cex.lab = 1.3, xlim =c(40, 100)) 
1
2
3
4
5
6
7
8
9
# Use ggplot to draw histogram
class(d$weight)
d$weight
df <- data.frame(value = d$weight)
df
library(ggplot2)
ggplot(data = df, aes(x = value)) +
geom_histogram(binwidth = 12) +
labs(x ="Weight (kg)", y="Freq", title = "Collected Weight Measurments")

Plot

1
plot(d$weight, d$il6r,col = "red", type = "p", main = "Weights", xlab = "Weights(kg)", ylab = "il6r expression")

Density plot

1
2
3
plot(density(d$weight[d$sex == "f"]), col = "red", lwd = 2, xlab = "Weights (kg)", main = "Density plot for weight", xlim = c(35, 105))
lines(density(d$weight[d$sex == "m"]), col = "blue", lwd = 2)
legend("topright", legend = c("Male", "Female"), fill = c("red", "blue"))

Boxplot

1
2
3
4
5
boxplot(d$weight ~ d$gen, col = c("blue", "red", "pink"),  main = "Weights by genotype", xlab = "Genotype", ylab = "Weight(kg)")
# make two boxplot in one graph
par(mfrow = c(1, 2))
boxplot(d2$bio2)
boxplot(d2$bio4)

Test

Normal distribution test

1
2
3
4
5
6
7
hist(d2$bio3)

qqnorm(d2$bio3)

qqline(d2$bio3)

shapiro.test(d$bio1) # Shapiro-Wilk normality test

t-test

One sample t-test

1
2
t.test(d$bio1, mu = 17, conf.level = 0.99)
t.test(d$bio2)$conf.int # the 95% confidence interval for bio2 in the population

Two sample t-test

1
t.test(d2$bio3 ~ d2$sex)

Two sample paired t-test

1
t.test(d$marker1[d$drug1 == "1"], d$marker2[d$drug1 == "1"], conf.level = 0.95, paired = TRUE)

non-parametric test

1
2
3
4
5
6
7
# sign-ranked test
bp <- c(130, 140, 115, 135, 300)
wilcox.test(bp, mu = 120)
# rank-sum test
A <- c(130, 140, 115, 135, 150)
B <- c(125, 110, 105, 100)
wilcox.test(A, B)

correlation test

1
2
3
4
5
6
7
8
9
# Pearson correlation is the parametric test
cor.test(d$dose, d$bio1, method = "pearson")
# The non-parametric test is called Spearman's Rho
cor.test(d$dose, d$bio2, method = "spearman", exact = F)

# Liner regression
plot(d$dose, d$bio2, main = paste("Biomarker 2 by Dose"), xlab = "Dose", ylab = "Biomarker 2", pch = 19, col = "green")
title(main = ("r = -0.85, p = 2.2e-16"), line = 0.5, cex.main = 0.8)
abline(lm(d$bio2 ~ d$dose), col = "black", lwd = 2) # regression line (y~x)