R Studio Course
To save in the memory of the program the data frame:
name you want to use for data frame = read.csv (“name you have the csv saved in your
computer”)
name you want to use for data frame <- read.csv (“name you have the csv saved in your
computer”)
To obtain the structure of your data frame:
str(name of the data frame)
To obtain a summary of your data frame:
summary(name of the data frame)
To obtain a subset of your data fram:
name of the subset you create = subset(name of the data frame, name of the collumn == “name
of the variable”)
To save your subset in your computer:
write.csv(name of the subset, “name of the document you want to save as.cvs”)
To see the variables your data frame has:
ls()
To see only a variable from your data frame:
name of the data frame$name of the variable you want to see
To obtain the mean, sd, and the summary of your variable:
mean(name of the data frame$name of the variable), sd(name of the data frame$name of the
variable), summary (name of the data frame$name of the variable)
To see exactly for which subject some values correspond:
which.min(name of the data frame$name of the variable)
To see the code allocated to a subject:
name of thee data frame$name of the collumn[value]
To make a plot:
plot(name of the data frame$name of the column for X axes, name of the data frame$name of
the column for Y axes)
To see the outliers:
outliers = subset(name of the data frame, name of the collumn > < == & name of the collumn
><==)
To view the outliers and see the number of outliers:
view(outliers)
nrow(outliers)
To view exactly the details of the outliers:
outliers[c(“name of the collumn”, “name of the collumn”, “name of the collumn”)]
getwd()
WHO = read.csv("WHO.csv")
str(WHO)
summary(WHO)
WHO_Europe = subset(WHO, Region == "Europe")
str(WHO_Europe)
write.csv(WHO_Europe, "WHO_Europe.csv")
ls()
rm(WHO_Europe)
ls()
Under15
WHO$Under15
mean(WHO$Under15)
sd(WHO$Under15)
summary(WHO$Under15)
outliers = subset(WHO, GNI > 10000 & FertilityRate > 2.5)
nrow(outliers)
outliers[c("Country", "GNI", "FertilityRate")]
hist(WHO$CellularSubscribers)
boxplot(WHO$LifeExpectancy ~ WHO$Region)
boxplot(WHO$LifeExpectancy ~ WHO$Region, xlab="", ylab="Life Expectancy", main="Life Expectancy
by Region")
table(WHO$Region)
tapply(WHO$Over60, WHO$Region, mean)
tapply(WHO$LiteracyRate, WHO$Region, min)
tapply(WHO$LiteracyRate, WHO$Region, min, na.rm=TRUE)
Introduction to R
Setting and getting the working directory
Use File > Change dir...
setwd("P:/Data/MATH/Hartlaub/DataAnalysis")
getwd()
Reading data (Creating a dataframe)
mydata=read.csv(file=file.choose())
Commands for dataframes
mydata #shows the entire data set
head(mydata) #shows the first 6 rows
tail(mydata) #shows the last 6 rows
str(mydata) #shows the variable names and types
names(mydata) #shows the variable names
ls() #shows a list of objects that are available
attach(mydata) #attaches the dataframe to the R search path, which makes it easy to access
variable names
Descriptive Statistics
mean(x) #computes the mean of the variable x
median(x) #computes the median of the variable x
sd(x) #computes the standard deviation of the variable x
IQR(x) #computer the IQR of the variable x
summary(x) #computes the 5-number summary and the mean of the variable x
t.test(x) #get a one sample t test
t.test(x,y) #get a two sample t test
t.test(x, y, paired=TRUE) #get a paired t test
cor(x,y) #computes the correlation coefficient
cor(mydata) #computes a correlation matrix
cor.test(x,y) #test plus CI for rho
Graphical Displays
windows(record=TRUE) #records your work, including plots
hist(x) #creates a histogram for the variable x
boxplot(x) # creates a boxplot for the variable x
cort
stem(x) #creates a stem plot for the variable x
plot(y~x) #creates a scatterplot of y versus x
plot(mydata) #provides a scatterplot matrix
abline(lm(y~x)) #adds regression line to plot
lines(lowess(x,y)) # adds lowess line (x,y) to plot
Liner Regression Models
regmodel=lm(y~x) #fit a regression model
summary(regmodel) #get results from fitting the regression model
anova(regmodel) #get the ANOVA table fro the regression fit
plot(regmodel) #get four plots, including normal probability plot, of residuals
fits=regmodel$fitted #store the fitted values in variable named "fits"
resids=regmodel$residuals #store the residual values in a varaible named "resids"
sresids=rstandard(regmodel) #store the standardized residuals in a variable named "sresids"
studresids=rstudent(regmodel) #store the studentized residuals in a variable named
"studresids"
beta1hat=regmodel$coeff[2] #assign the slope coefficient to the name "beta1hat"
qt(.975,15) # find the 97.5% percentile for a t distribution with 15 df
confint(regmodel) #CIs for all parameters
newx=data.frame(X=41) #create a new data frame with one new x* value of 41
predict.lm(regmodel,newx,interval="confidence") #get a CI for the mean at the value x*
predict.lm(model,newx,interval="prediction") #get a prediction interval for an individual Y value
at the value x*
hatvalues(regmodel) #get the leverage values (hi)
Model Selection
o library(leaps) #load the leaps package
o allmods = regsubsets(y~x1+x2+x3+x4, nbest=2, data=mydata) #(leaps package must be
loaded), identify best two models for 1, 2, 3 predictors
o summary(allmods) # get summary of best subsets
o summary(allmods)$adjr2 #adjusted R^2 for some models
o summary(allmods)$cp #Cp for some models
o plot(allmods, scale="adjr2") # plot that identifies models
o plot(allmods, scale="Cp") # plot that identifies models
o fullmodel=lm(y~., data=mydata) # regress y on everything in mydata
o MSE=(summary(fullmodel)$sigma)^2 # store MSE for the full model
o extractAIC(lm(y~x1+x2+x3), scale=MSE) #get Cp (equivalent to AIC)
o step(fullmodel, scale=MSE, direction="backward") #backward elimination
o step(fullmodel, scale=MSE, direction="forward") #forward elimination
o step(fullmodel, scale=MSE, direction="both") #stepwise regression
o none(lm(y~1) #regress y on the constant only
o step(none, scope=list(upper=fullmodel), scale=MSE) #use Cp in stepwise regression
Logistic Regression
table(y) #get a table of the distribution of y
mytable=table(y, x) #get a 2-way table of y by x
chisq.test(mytable) #Chi-sq test with Yates continuity correction
chisq.test(mytable, correction=FALSE) #Chi-sq test of independence without Yates continuity
correction
prop.table(table(y, x),1) #get a table of row proportions
prop.table(table(y, x),2) #get a table of column proportions
prop.test(c(39,22), c(100,100), correction=FALSE) #2-sample proportion test without Yates
continuity correction
plot(x,jitter(y,amount=0.05)) #jitter y in the plot
anova(reducedmodel, fullmodel, test="Chisq") #nested G test
drop1(mymodel, test="Chisq") #G tests to see what to drop next
as.factor(X) #create dummy variables for the levels of the variable X
model1=glm(y~as.factor(X), family=binomial) #fit model with the categories of X as predictors
summary(model1) #gives Z tests, residual deviance, and null deviance
anova(model1, test="Chisq") #test of H0: constant term is all that is needed. (i.e., nested G test
against the model y~1.)
confint(model1) #CIs for all parameters
confint(model1, parm="x") #CI for the coefficient of x
exp(confint(model1, parm="x")) #CI for odds ratio
shortmodel=glm(cbind(y1,y2)~x, family=binomial) binomial inputs
dresid=residuals(model1, type="deviance") #deviance residuals
presid=residuals(model1, type="pearson") #Pearson residuals
plot(residuals(model1, type="deviance")) #plot of deviance residuals
newx=data.frame(X=20) #set (X=20) for an upcoming prediction
predict(mymodel, newx, type="response") #get predicted probability at X=20
Analysis of Variance
t.test(y~x, var.equal=TRUE) #pooled t-test where x is a factor
x=as.factor(x) #coerce x to be a factor variable
tapply(y, x, mean) #get mean of y at each level of x
tapply(y, x, sd) #get stadard deviations of y at each level of x
tapply(y, x, length) #get sample sizes of y at each level of x
library(gplots) #load gplots package
o plotmeans(y~x) #means and 95% confidence intervals
AOVmodel=aov(y~x) #one-way ANOVA
summary(AOVmodel) #get ANOVA output
oneway.test(y~x, var.equal=TRUE) #one-way test output
library(car) #load car package
o levene.test(y,x) #Levene's test for equal variances
blockmodel=aov(y~x+block) #Randomized block design model with "block" as a variable
tapply(lm(y~x1:x2,mean) #get the mean of y for each cell of x1 by x2
anova(lm(y~x1+x2)) #a way to get a two-way ANOVA table
interaction.plot(FactorA, FactorB, y) #get an interaction plot
pairwise.t.test(y,x,p.adj="none") #pairwise t tests
pairwise.t.test(y,x,p.adj="bonferroni") #pairwise t tests
TukeyHSD(AOVmodel) #get Tukey CIs and P-values
plot(TukeyHSD(AOVmodel)) #get 95% family-wise CIs
library(multcomp) #load multcomp package
o contrast=rbind(c(.5,.5,-1/3,-1/3,-1/3)) #set up a contrast
o summary(glht(AOVmodel, linfct=mcp(x=contrast))) #test a contrast
o confint(glht(AOVmodel, linfct=mcp(x=contrast))) #CI for a contrast
kruskal.test(y~x) #Kruskal-Wallis test
friedman.test(y,x,block) #Friedman test for block design
Create a personalized histogram
# Create a histogram of the price of
# all the diamonds in the diamond data set.
ggplot(diamonds, aes(x = price)) +
geom_histogram(color = "black", fill = "DarkOrange", binwidth = 500) +
scale_x_continuous(labels = dollar, breaks = seq(0, 20000, 1000)) +
theme(axis.text.x = element_text(angle = 90)) +
xlab("Price") + ylab("Count")
# a) How many diamonds cost less than $500?
diamonds %>%
filter(price < 500) %>%
summarise(n = n())
# Break out the histogram of diamond prices by cut.
# You should have five histograms in separate
# panels on your resulting plot.
ggplot(diamonds, aes(x = price)) +
geom_histogram(color = "black", fill = "DarkOrange", binwidth = 25) +
scale_x_continuous(labels = dollar, breaks = seq(0, 4000, 100)) +
theme(axis.text.x = element_text(angle = 90)) +
coord_cartesian(c(0,4000)) +
facet_grid(cut~.) +
xlab("Price") + ylab("Count")
# a) Which cut has the highest priced diamond?
# Premium
# by(diamonds$price, diamonds$cut, max)
# by(diamonds$price, diamonds$cut, min)
# by(diamonds$price, diamonds$cut, median)
diamonds %>%
group_by(cut) %>%
summarise(max_price = max(price),
min_price = min(price),
median_price = median(price))