Introduction to R
1 R Graphical User Interfaces
R provides several interfaces for users to interact with the programming environment. Understanding these
interfaces is crucial for effective R programming.
1.1 RStudio IDE
RStudio is the most popular integrated development environment for R. It consists of four main panes:
• Console: Where R commands are executed and output is displayed
• Script Editor: For writing and editing R scripts
• Environment/History: Shows variables in workspace and command history
• Files/Plots/Packages/Help: File browser, plot viewer, package manager, and help system
Key Features:
• Syntax highlighting and code completion
• Project management capabilities
• Integrated version control (Git/SVN)
• R Markdown support for reproducible research
• Debugging tools and profiling
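A quick way to confirm which environment your code is running in is to query the session itself. The sketch below assumes the rstudioapi package is installed (it usually is installed alongside RStudio); sessionInfo() works in any R session.
# Check whether this session is running inside RStudio (sketch; assumes the
# rstudioapi package is installed)
if (requireNamespace("rstudioapi", quietly = TRUE) && rstudioapi::isAvailable()) {
  print(rstudioapi::versionInfo()$version)   # RStudio version
}
sessionInfo()                                # R version, platform, loaded packages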
1.2 R Console
The basic R console is the command-line interface where:
• Commands are entered after the prompt (>)
• Results are immediately displayed
• Multi-line commands show a continuation prompt (+), as in the example session below
• Command history can be recalled with the arrow keys
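For example, a short console session showing the prompts described above (output lines appear exactly as the console prints them):
> x <- c(1, 2, 3)
> mean(x)
[1] 2
> y <- c(10, 20,
+        30)
> y
[1] 10 20 30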
1.3 Alternative Interfaces
• R Commander: GUI-based interface for beginners
• Jupyter Notebooks: Web-based interface supporting multiple languages
• Visual Studio Code: With R extensions for advanced users
2 Data Import and Export
Data import and export are fundamental operations in R for working with external data sources.
2.1 Reading Data Files
CSV Files:
# Basic CSV reading
data <- read.csv("filename.csv")

# With specific parameters
data <- read.csv("filename.csv", header = TRUE, sep = ",",
                 stringsAsFactors = FALSE)

# Using the readr package (tidyverse)
library(readr)
data <- read_csv("filename.csv")
Excel Files:
# Using the readxl package
library(readxl)
data <- read_excel("filename.xlsx", sheet = "Sheet1")

# Read all sheets into a list
sheet_names <- excel_sheets("filename.xlsx")
data_list <- lapply(sheet_names, function(x) read_excel("filename.xlsx", sheet = x))
Text Files:
# Tab-delimited files
data <- read.table("filename.txt", header = TRUE, sep = "\t")

# Fixed-width files
data <- read.fwf("filename.txt", widths = c(10, 15, 8))
2.2 Database Connections
# SQLite example
library(DBI)       # dbConnect(), dbReadTable(), dbGetQuery()
library(RSQLite)
con <- dbConnect(SQLite(), "database.db")
data <- dbReadTable(con, "table_name")
dbDisconnect(con)

# MySQL example
library(RMySQL)
con <- dbConnect(MySQL(), user = "username", password = "password",
                 dbname = "database", host = "localhost")
data <- dbGetQuery(con, "SELECT * FROM table_name")
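When a query depends on user-supplied values, a parameterized query is safer than pasting strings together. This is a minimal sketch using DBI's params argument; it assumes an open connection con and a backend that supports the ? placeholder syntax (RSQLite does).
# Parameterized query (sketch; assumes an open connection `con`)
result <- dbGetQuery(con, "SELECT * FROM table_name WHERE id = ?",
                     params = list(42))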
2.3 Exporting Data
# CSV export
write.csv(data, "output.csv", row.names = FALSE)

# Excel export
library(openxlsx)
write.xlsx(data, "output.xlsx")

# Multiple sheets (a named list becomes one sheet per element)
data_list <- list("Sheet1" = data1, "Sheet2" = data2)
write.xlsx(data_list, "output.xlsx")
3 Attribute and Data Types
Understanding R’s data types and structures is essential for effective data manipulation and analysis.
3.1 Basic Data Types
Numeric:
# Integer
x <- 5L
class(x)   # "integer"

# Double (the default numeric type)
y <- 5.5
class(y)   # "numeric"

# Scientific notation
z <- 1.5e3 # 1500
Character:
# Character strings
name <- "John Doe"
text <- 'Single quotes work too'

# String operations
paste("Hello", "World")       # "Hello World"
nchar("Hello")                # 5
substr("Hello World", 1, 5)   # "Hello"
Logical:
# Boolean values
is_true <- TRUE
is_false <- FALSE

# Logical operations
TRUE & FALSE   # FALSE (AND)
TRUE | FALSE   # TRUE  (OR)
!TRUE          # FALSE (NOT)
Special Values:
# Missing values
NA                # Not Available
is.na(x)          # Check for NA

# Infinite values
Inf               # Positive infinity
-Inf              # Negative infinity
is.infinite(x)

# Not a Number
NaN               # Not a Number
is.nan(x)

# NULL (empty object)
NULL
is.null(x)
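These special values behave differently in ways that matter in practice. A short sketch: NULL disappears when combined into a vector, NA is kept, and comparisons with NA return NA rather than TRUE or FALSE.
# NA is a placeholder element; NULL is the absence of one
length(c(1, NA))       # 2 (NA is kept)
length(c(1, NULL))     # 1 (NULL is dropped)

# Comparisons with NA propagate NA; test with is.na() instead
NA == NA               # NA
is.na(NA)              # TRUE

# NaN arises from undefined arithmetic and also counts as missing
0 / 0                  # NaN
is.na(NaN)             # TRUE
is.nan(NA)             # FALSE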
3.2 Data Structures
Vectors:
# Creating vectors
numeric_vector <- c(1, 2, 3, 4, 5)
character_vector <- c("a", "b", "c")
logical_vector <- c(TRUE, FALSE, TRUE)

# Vector operations
length(numeric_vector)               # 5
numeric_vector[1]                    # First element
numeric_vector[c(1, 3)]              # Elements 1 and 3
numeric_vector[numeric_vector > 3]   # Conditional selection
Matrices:
# Creating matrices
matrix1 <- matrix(1:12, nrow = 3, ncol = 4)
matrix2 <- matrix(1:12, nrow = 3, byrow = TRUE)

# Matrix operations
dim(matrix1)      # Dimensions
nrow(matrix1)     # Number of rows
ncol(matrix1)     # Number of columns
matrix1[2, 3]     # Element at row 2, column 3
matrix1[2, ]      # Entire row 2
matrix1[, 3]      # Entire column 3
Lists:
# Creating lists (can contain different data types)
my_list <- list(
  numbers = c(1, 2, 3),
  characters = c("a", "b", "c"),
  logical = TRUE,
  matrix = matrix(1:6, nrow = 2)
)

# Accessing list elements
my_list$numbers        # By name
my_list[[1]]           # By position
my_list[["numbers"]]   # By name with double brackets
Data Frames:
# Creating data frames
df <- data.frame(
  name = c("Alice", "Bob", "Charlie"),
  age = c(25, 30, 35),
  married = c(TRUE, FALSE, TRUE)
)

# Data frame operations
str(df)             # Structure
summary(df)         # Summary statistics
df$name             # Access a column
df[1, ]             # First row
df[, "age"]         # Age column
df[df$age > 30, ]   # Conditional selection
Factors:
# Creating factors (categorical data)
gender <- factor(c("M", "F", "M", "F", "M"))
education <- factor(c("High School", "College", "Graduate"),
                    levels = c("High School", "College", "Graduate"),
                    ordered = TRUE)

# Factor operations
levels(gender)    # Get levels
nlevels(gender)   # Number of levels
table(gender)     # Frequency table
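One common pitfall worth a short sketch: as.numeric() on a factor returns the internal level codes, not the original values; convert to character first.
# Factor-to-numeric conversion pitfall
f <- factor(c("10", "20", "30"))
as.numeric(f)                 # 1 2 3   (level codes)
as.numeric(as.character(f))   # 10 20 30 (original values)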
4 Descriptive Statistics
Descriptive statistics summarize and describe the main features of a dataset.
4.1 Measures of Central Tendency
Mean:
# Arithmetic mean
data <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
mean(data)                         # 5.5

# Handling missing values
data_with_na <- c(1, 2, NA, 4, 5)
mean(data_with_na)                 # NA
mean(data_with_na, na.rm = TRUE)   # 3

# Trimmed mean (reduce the influence of outliers)
mean(data, trim = 0.1)             # Remove 10% from each end
Median:
# Median (middle value)
median(data)                  # 5.5
median(c(1, 2, 3, 4, 5))      # 3 (odd number of values)

# Robust to outliers
data_with_outlier <- c(1, 2, 3, 4, 100)
mean(data_with_outlier)       # 22 (affected by the outlier)
median(data_with_outlier)     # 3 (not affected)
Mode:
# Mode (most frequent value) - no built-in function
get_mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}
data_mode <- c(1, 2, 2, 3, 3, 3, 4)
get_mode(data_mode)   # 3
4.2 Measures of Variability
Range:
# Range (minimum and maximum; the spread is their difference)
range(data)             # 1 10
diff(range(data))       # 9
max(data) - min(data)   # 9
Variance and Standard Deviation:
# Variance (average squared deviation from the mean)
var(data)   # 9.166667

# Standard deviation (square root of the variance)
sd(data)    # 3.02765

# Population vs. sample:
# R uses the sample formulas by default (n - 1 denominator).
# For the population versions, multiply by (n - 1) / n.
n <- length(data)
pop_var <- var(data) * (n - 1) / n
pop_sd <- sqrt(pop_var)
Quantiles and Percentiles:
# Quantiles
quantile(data)                        # 0%, 25%, 50%, 75%, 100%
quantile(data, probs = c(0.1, 0.9))   # 10th and 90th percentiles

# Interquartile range (IQR)
IQR(data)                             # Q3 - Q1
quantile(data, 0.75) - quantile(data, 0.25)
4.3 Distribution Shape
Skewness:
# Using the moments package
library(moments)
skewness(data)   # Measure of asymmetry
# > 0: right-skewed, < 0: left-skewed, = 0: symmetric
Kurtosis:
# Measure of tail heaviness
kurtosis(data)   # A normal distribution has kurtosis = 3
# > 3: heavy tails, < 3: light tails
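To make the definitions concrete, both statistics can be computed directly from the sample moments. The sketch below uses the moment-based formulas (n in the denominator), which is what the moments package computes.
# Skewness and kurtosis from first principles (sketch)
m  <- mean(data)
m2 <- mean((data - m)^2)                       # second central moment
skew_manual <- mean((data - m)^3) / m2^(3/2)   # third standardized moment
kurt_manual <- mean((data - m)^4) / m2^2       # fourth standardized moment
c(skewness = skew_manual, kurtosis = kurt_manual)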
4.4 Summary Functions
Built-in Summary:
# Comprehensive summary
summary(data)   # Min, Q1, Median, Mean, Q3, Max

# For data frames
df <- data.frame(x = rnorm(100), y = runif(100))
summary(df)     # Summary for each column
Descriptive Statistics by Group:
# Using the aggregate function
df <- data.frame(
  group = rep(c("A", "B"), each = 50),
  value = c(rnorm(50, 10, 2), rnorm(50, 15, 3))
)
aggregate(value ~ group, data = df, FUN = mean)
aggregate(value ~ group, data = df, FUN = sd)

# Multiple statistics at once
aggregate(value ~ group, data = df,
          FUN = function(x) c(mean = mean(x), sd = sd(x)))
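The same grouped summaries can also be written with dplyr, which is used later in these notes; a brief alternative sketch:
# dplyr alternative for grouped summaries (sketch)
library(dplyr)
df %>%
  group_by(group) %>%
  summarise(mean = mean(value), sd = sd(value), n = n())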
5 Exploratory Data Analysis
Exploratory Data Analysis (EDA) is the process of analyzing datasets to summarize their main characteristics, often using statistical graphics and other data visualization methods.
5.1 Data Structure Examination
Basic Data Inspection:
# Load an example dataset
data(mtcars)

# Basic structure
str(mtcars)        # Structure of the data
dim(mtcars)        # Dimensions
nrow(mtcars)       # Number of rows
ncol(mtcars)       # Number of columns

# First and last observations
head(mtcars)       # First 6 rows
tail(mtcars)       # Last 6 rows
head(mtcars, 10)   # First 10 rows

# Variable names
names(mtcars)      # Column names
colnames(mtcars)   # Same as names()
rownames(mtcars)   # Row names
Missing Value Analysis:
# Check for missing values
sum(is.na(mtcars))       # Total missing values
colSums(is.na(mtcars))   # Missing values per column
any(is.na(mtcars))       # Any missing values?

# Missing value patterns
library(VIM)
aggr(mtcars, col = c("navyblue", "red"),
     numbers = TRUE, sortVars = TRUE)

# Create missing values for demonstration
mtcars_na <- mtcars
mtcars_na[1:5, "mpg"] <- NA
mtcars_na[3:7, "hp"] <- NA

# Visualize missing patterns
library(mice)
md.pattern(mtcars_na)
5.2 Univariate Analysis
Continuous Variables:
# Summary statistics
summary(mtcars$mpg)

# Detailed statistics using the psych package
library(psych)
describe(mtcars$mpg)   # Comprehensive descriptive statistics

# Distribution analysis
hist(mtcars$mpg, breaks = 10, main = "Distribution of MPG")
boxplot(mtcars$mpg, main = "Boxplot of MPG")

# Density plot
plot(density(mtcars$mpg), main = "Density Plot of MPG")

# Quantile-quantile plot
qqnorm(mtcars$mpg)
qqline(mtcars$mpg)
Categorical Variables:
# Frequency analysis
table(mtcars$cyl)               # Frequency table
prop.table(table(mtcars$cyl))   # Proportions

# Bar plot
barplot(table(mtcars$cyl), main = "Distribution of Cylinders")

# Using factors
mtcars$cyl_factor <- factor(mtcars$cyl)
summary(mtcars$cyl_factor)
5.3 Bivariate Analysis
Continuous vs Continuous:
# Correlation
cor(mtcars$mpg, mtcars$hp)                        # Pearson correlation
cor(mtcars$mpg, mtcars$hp, method = "spearman")   # Spearman correlation

# Scatter plot
plot(mtcars$hp, mtcars$mpg,
     xlab = "Horsepower", ylab = "Miles per Gallon",
     main = "MPG vs Horsepower")

# Add a regression line
abline(lm(mpg ~ hp, data = mtcars), col = "red")

# Correlation matrix
cor_matrix <- cor(mtcars)
print(cor_matrix)

# Visualize the correlation matrix
library(corrplot)
corrplot(cor_matrix, method = "circle")
Continuous vs Categorical:
# Box plots by group
boxplot(mpg ~ cyl, data = mtcars,
        xlab = "Number of Cylinders", ylab = "Miles per Gallon")

# Summary statistics by group
aggregate(mpg ~ cyl, data = mtcars, FUN = summary)

# Violin plots using ggplot2
library(ggplot2)
ggplot(mtcars, aes(x = factor(cyl), y = mpg)) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  labs(x = "Number of Cylinders", y = "Miles per Gallon")
Categorical vs Categorical:
# Contingency table
table(mtcars$cyl, mtcars$gear)

# Chi-square test
chisq.test(mtcars$cyl, mtcars$gear)

# Mosaic plot
mosaicplot(table(mtcars$cyl, mtcars$gear),
           main = "Cylinders vs Gears")
5.4 Outlier Detection
Statistical Methods:
# Using the IQR method
Q1 <- quantile(mtcars$mpg, 0.25)
Q3 <- quantile(mtcars$mpg, 0.75)
IQR_val <- IQR(mtcars$mpg)

# Define outlier bounds
lower_bound <- Q1 - 1.5 * IQR_val
upper_bound <- Q3 + 1.5 * IQR_val

# Identify outliers
outliers <- mtcars$mpg < lower_bound | mtcars$mpg > upper_bound
mtcars[outliers, ]

# Using the z-score method
z_scores <- abs(scale(mtcars$mpg))
outliers_z <- z_scores > 3
mtcars[outliers_z, ]
Visual Methods:
# Box plot outliers
boxplot(mtcars$mpg, main = "MPG Outliers")$out

# Scatter plot with outliers highlighted
plot(mtcars$hp, mtcars$mpg)
points(mtcars$hp[outliers], mtcars$mpg[outliers], col = "red", pch = 19)
6 Visualization Before Analysis
Data visualization is crucial for understanding patterns, trends, and relationships in data before conducting
formal statistical analysis.
6.1 Base R Graphics
Basic Plots:
# Scatter plot
plot(mtcars$hp, mtcars$mpg,
     xlab = "Horsepower", ylab = "Miles per Gallon",
     main = "MPG vs Horsepower",
     pch = 19, col = "blue")

# Line plot
x <- 1:10
y <- x^2
plot(x, y, type = "l", lwd = 2, col = "red")

# Multiple lines
plot(x, y, type = "l", col = "red", ylim = c(0, 150))
lines(x, x^1.5, col = "blue")
lines(x, x^2.5, col = "green")
legend("topleft", legend = c("x^2", "x^1.5", "x^2.5"),
       col = c("red", "blue", "green"), lty = 1)
Histograms and Density Plots:
# Histogram
hist(mtcars$mpg,
     breaks = 10,
     main = "Distribution of MPG",
     xlab = "Miles per Gallon",
     col = "lightblue",
     border = "black")

# Density plot
plot(density(mtcars$mpg),
     main = "Density Plot of MPG",
     xlab = "Miles per Gallon",
     lwd = 2, col = "red")

# Overlay histogram and density
hist(mtcars$mpg, freq = FALSE, col = "lightblue",
     main = "MPG Distribution")
lines(density(mtcars$mpg), col = "red", lwd = 2)
Box Plots:
# Single box plot
boxplot(mtcars$mpg,
        main = "MPG Distribution",
        ylab = "Miles per Gallon")

# Multiple box plots
boxplot(mpg ~ cyl, data = mtcars,
        main = "MPG by Number of Cylinders",
        xlab = "Cylinders", ylab = "Miles per Gallon",
        col = c("red", "blue", "green"))
Bar Plots:
# Simple bar plot
counts <- table(mtcars$cyl)
barplot(counts,
        main = "Number of Cars by Cylinder",
        xlab = "Cylinders", ylab = "Frequency",
        col = "steelblue")

# Grouped bar plot
counts_gear <- table(mtcars$cyl, mtcars$gear)
barplot(counts_gear,
        main = "Cars by Cylinder and Gear",
        xlab = "Gears", ylab = "Frequency",
        col = c("red", "blue", "green"),
        legend = rownames(counts_gear),
        beside = TRUE)
6.2 ggplot2 Graphics
Grammar of Graphics:
library(ggplot2)

# Basic scatter plot
ggplot(mtcars, aes(x = hp, y = mpg)) +
  geom_point() +
  labs(title = "MPG vs Horsepower",
       x = "Horsepower", y = "Miles per Gallon")

# Enhanced scatter plot
ggplot(mtcars, aes(x = hp, y = mpg, color = factor(cyl))) +
  geom_point(size = 3, alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "MPG vs Horsepower by Cylinders",
       x = "Horsepower", y = "Miles per Gallon",
       color = "Cylinders") +
  theme_minimal()
Histograms and Density Plots:
# Histogram
ggplot(mtcars, aes(x = mpg)) +
  geom_histogram(bins = 10, fill = "skyblue", color = "black") +
  labs(title = "Distribution of MPG", x = "Miles per Gallon", y = "Count")

# Density plot
ggplot(mtcars, aes(x = mpg)) +
  geom_density(fill = "lightblue", alpha = 0.7) +
  labs(title = "Density Plot of MPG", x = "Miles per Gallon")

# Multiple densities
ggplot(mtcars, aes(x = mpg, fill = factor(cyl))) +
  geom_density(alpha = 0.5) +
  labs(title = "MPG Distribution by Cylinders",
       x = "Miles per Gallon", fill = "Cylinders")
Box Plots:
# Box plot
ggplot(mtcars, aes(x = factor(cyl), y = mpg)) +
  geom_boxplot(fill = "lightgreen") +
  labs(title = "MPG by Number of Cylinders",
       x = "Cylinders", y = "Miles per Gallon")

# Violin plot
ggplot(mtcars, aes(x = factor(cyl), y = mpg)) +
  geom_violin(fill = "lightcoral") +
  geom_boxplot(width = 0.1, fill = "white") +
  labs(title = "MPG Distribution by Cylinders",
       x = "Cylinders", y = "Miles per Gallon")
Faceting:
# Facet wrap
ggplot(mtcars, aes(x = hp, y = mpg)) +
  geom_point() +
  facet_wrap(~ cyl, labeller = labeller(cyl = function(x) paste("Cylinders:", x))) +
  labs(title = "MPG vs Horsepower by Cylinders",
       x = "Horsepower", y = "Miles per Gallon")

# Facet grid
ggplot(mtcars, aes(x = hp, y = mpg)) +
  geom_point() +
  facet_grid(cyl ~ gear) +
  labs(title = "MPG vs Horsepower by Cylinders and Gears")
6.3 Correlation and Heatmap Visualization
Correlation Heatmap:
# Using corrplot
library(corrplot)
cor_matrix <- cor(mtcars)
corrplot(cor_matrix, method = "color",
         type = "upper", order = "hclust",
         tl.cex = 0.8, tl.col = "black")

# Using ggplot2
library(reshape2)
cor_melted <- melt(cor_matrix)
ggplot(cor_melted, aes(Var1, Var2, fill = value)) +
  geom_tile() +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white",
                       midpoint = 0, limit = c(-1, 1), space = "Lab",
                       name = "Correlation") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 10, hjust = 1)) +
  coord_fixed()
Pairs Plot:
# Simple pairs plot
pairs(mtcars[, c("mpg", "hp", "wt", "qsec")])

# Enhanced pairs plot using GGally
library(GGally)
ggpairs(mtcars[, c("mpg", "hp", "wt", "qsec", "cyl")],
        aes(color = factor(cyl)))
7 Analytics for Unstructured Data
Unstructured data, particularly text data, requires specialized techniques for analysis. This section covers
fundamental text mining and analysis techniques in R.
7.1 Text Data Preparation
Basic Text Operations:
# Sample text data
text_data <- c("This is the first document.",
               "This document is the second one.",
               "And this is the third document.",
               "Is this the first document?")

# Basic string operations
nchar(text_data)     # Character count
tolower(text_data)   # Convert to lowercase
toupper(text_data)   # Convert to uppercase

# String manipulation
gsub("document", "text", text_data)   # Replace words
grep("first", text_data)              # Indices of pattern matches
grepl("first", text_data)             # Logical vector of matches
Text Preprocessing with tm Package:
library(tm)

# Create a corpus
corpus <- Corpus(VectorSource(text_data))

# Preprocessing steps
corpus <- tm_map(corpus, content_transformer(tolower))        # Lowercase
corpus <- tm_map(corpus, removePunctuation)                   # Remove punctuation
corpus <- tm_map(corpus, removeNumbers)                       # Remove numbers
corpus <- tm_map(corpus, removeWords, stopwords("english"))   # Remove stopwords
corpus <- tm_map(corpus, stripWhitespace)                     # Remove extra whitespace

# Stemming (reduce words to their root form)
library(SnowballC)
corpus <- tm_map(corpus, stemDocument)

# View the preprocessed text
lapply(corpus, as.character)
7.2 Document-Term Matrix
Creating DTM:
# Create a Document-Term Matrix
dtm <- DocumentTermMatrix(corpus)

# Inspect the DTM
inspect(dtm)   # View the matrix
dim(dtm)       # Dimensions
Terms(dtm)     # Get terms
Docs(dtm)      # Get document names

# Convert to a matrix for analysis
dtm_matrix <- as.matrix(dtm)
print(dtm_matrix)

# Term frequencies
term_freq <- colSums(dtm_matrix)
print(sort(term_freq, decreasing = TRUE))
Term Frequency Analysis:
# Most frequent terms
findFreqTerms(dtm, lowfreq = 2)

# Word frequency data frame
freq_df <- data.frame(word = names(term_freq), freq = term_freq)
freq_df <- freq_df[order(freq_df$freq, decreasing = TRUE), ]

# Bar plot of the most frequent terms
library(ggplot2)
top_terms <- head(freq_df, 10)
ggplot(top_terms, aes(x = reorder(word, freq), y = freq)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Most Frequent Terms", x = "Terms", y = "Frequency")

# Word cloud
library(wordcloud)
library(RColorBrewer)   # brewer.pal()
wordcloud(words = freq_df$word, freq = freq_df$freq,
          min.freq = 1, max.words = 100,
          random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
7.3 Advanced Text Analysis
TF-IDF (Term Frequency-Inverse Document Frequency):
# Calculate TF-IDF weights
tfidf <- weightTfIdf(dtm)
tfidf_matrix <- as.matrix(tfidf)

# Most important terms per document
apply(tfidf_matrix, 1, function(x) {
  names(sort(x, decreasing = TRUE)[1:3])
})

# Overall TF-IDF scores
tfidf_scores <- colSums(tfidf_matrix)
print(sort(tfidf_scores, decreasing = TRUE))
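For intuition, TF-IDF can also be computed by hand from the raw counts: each term's within-document frequency multiplied by the log inverse document frequency. The sketch below uses the common tf × log(N / df) form; tm's weightTfIdf() uses a log2-based, normalized variant, so the exact numbers will differ slightly.
# Manual TF-IDF sketch (assumes dtm_matrix from above)
tf   <- dtm_matrix / rowSums(dtm_matrix)   # term frequency per document
docf <- colSums(dtm_matrix > 0)            # documents containing each term
idf  <- log(nrow(dtm_matrix) / docf)       # inverse document frequency
tfidf_manual <- sweep(tf, 2, idf, `*`)     # multiply each term column by its idf
round(tfidf_manual, 3)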
Document Similarity:
# Cosine similarity between documents
library(proxy)
doc_similarity <- dist(dtm_matrix, method = "cosine")
print(as.matrix(doc_similarity))

# Hierarchical clustering of documents
hc <- hclust(doc_similarity)
plot(hc, main = "Document Clustering")

# Dendrogram
library(dendextend)
dend <- as.dendrogram(hc)
plot(dend, main = "Document Dendrogram")
7.4 Sentiment Analysis
Lexicon-based Sentiment Analysis:
# Using the tidytext package
library(tidytext)
library(dplyr)
library(tidyr)   # spread()

# Sample text with sentiment
text_sentiment <- data.frame(
  text = c("I love this product! It's amazing!",
           "This is terrible. I hate it.",
           "It's okay, nothing special.",
           "Absolutely fantastic! Highly recommend!",
           "Worst purchase ever. Very disappointed."),
  doc_id = 1:5
)

# Tokenize the text
text_tokens <- text_sentiment %>%
  unnest_tokens(word, text)

# Sentiment scores using the AFINN lexicon
# (get_sentiments("afinn") and "nrc" may prompt to download via the textdata package)
sentiment_scores <- text_tokens %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(doc_id) %>%
  summarise(sentiment_score = sum(value))
print(sentiment_scores)

# Using the bing lexicon (positive/negative)
sentiment_bing <- text_tokens %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(doc_id, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment_score = positive - negative)
print(sentiment_bing)
Emotion Analysis:
# Using the NRC emotion lexicon
emotions <- text_tokens %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  count(doc_id, sentiment) %>%
  spread(sentiment, n, fill = 0)
print(emotions)

# Visualize emotions
library(ggplot2)
emotion_summary <- text_tokens %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  count(sentiment) %>%
  filter(!sentiment %in% c("positive", "negative"))

ggplot(emotion_summary, aes(x = reorder(sentiment, n), y = n)) +
  geom_bar(stat = "identity", fill = "coral") +
  coord_flip() +
  labs(title = "Emotion Analysis", x = "Emotions", y = "Count")
7.5 Topic Modeling
Latent Dirichlet Allocation (LDA):
# Topic modeling with the topicmodels package
library(topicmodels)
library(tidytext)   # tidy() methods for LDA objects
library(dplyr)
library(ggplot2)

# A larger sample dataset for topic modeling
sample_docs <- c(
  "The economy is growing steadily with low unemployment rates.",
  "Stock market reaches new highs as investors remain optimistic.",
  "Climate change poses significant challenges to our environment.",
  "Renewable energy sources are becoming more cost-effective.",
  "New medical breakthrough offers hope for cancer patients.",
  "Healthcare costs continue to rise across the nation.",
  "Education reform focuses on improving student outcomes.",
  "Technology integration in schools enhances learning experiences.",
  "Social media platforms face increased scrutiny over privacy.",
  "Artificial intelligence transforms various industries."
)

# Create a corpus and DTM
sample_corpus <- Corpus(VectorSource(sample_docs))
sample_corpus <- tm_map(sample_corpus, content_transformer(tolower))
sample_corpus <- tm_map(sample_corpus, removePunctuation)
sample_corpus <- tm_map(sample_corpus, removeNumbers)
sample_corpus <- tm_map(sample_corpus, removeWords, stopwords("english"))
sample_corpus <- tm_map(sample_corpus, stripWhitespace)
sample_dtm <- DocumentTermMatrix(sample_corpus)

# Fit the LDA model
lda_model <- LDA(sample_dtm, k = 3, control = list(seed = 123))

# Extract the per-topic word probabilities
topics <- tidy(lda_model, matrix = "beta")
print(topics)

# Top terms per topic
top_terms <- topics %>%
  group_by(topic) %>%
  top_n(5, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
print(top_terms)

# Visualize the topics
top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered() +
  labs(title = "Top Terms per Topic")

# Document-topic probabilities
doc_topics <- tidy(lda_model, matrix = "gamma")
print(doc_topics)
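Each document can then be assigned to its most probable topic from the gamma matrix; a short sketch using dplyr:
# Assign each document to its most likely topic (sketch)
doc_assignments <- doc_topics %>%
  group_by(document) %>%
  slice_max(gamma, n = 1) %>%
  ungroup()
print(doc_assignments)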
7.6 Text Classification
Naive Bayes Classification:
# Text classification example
library(e1071)
library(caret)

# Sample classification data (sentiment stored as a factor for naiveBayes)
text_class_data <- data.frame(
  text = c("This product is amazing and works perfectly",
           "Terrible quality, broke after one day",
           "Good value for money, satisfied with purchase",
           "Worst product ever, complete waste of money",
           "Excellent service and fast delivery",
           "Poor customer service, very disappointed",
           "Great product, highly recommend to others",
           "Defective item, requesting immediate refund"),
  sentiment = factor(c("positive", "negative", "positive", "negative",
                       "positive", "negative", "positive", "negative"))
)

# Create a corpus and DTM
class_corpus <- Corpus(VectorSource(text_class_data$text))
class_corpus <- tm_map(class_corpus, content_transformer(tolower))
class_corpus <- tm_map(class_corpus, removePunctuation)
class_corpus <- tm_map(class_corpus, removeWords, stopwords("english"))

class_dtm <- DocumentTermMatrix(class_corpus)
class_matrix <- as.matrix(class_dtm)

# Prepare the data for classification
class_df <- data.frame(class_matrix, sentiment = text_class_data$sentiment)

# Split the data (in practice, you would have far more data)
set.seed(123)
train_indices <- sample(1:nrow(class_df), 0.7 * nrow(class_df))
train_data <- class_df[train_indices, ]
test_data <- class_df[-train_indices, ]

# Train a Naive Bayes model
nb_model <- naiveBayes(sentiment ~ ., data = train_data)

# Make predictions
predictions <- predict(nb_model, test_data)
print(predictions)

# Confusion matrix (with enough test data)
# confusionMatrix(predictions, test_data$sentiment)
7.7 Web Scraping for Text Data
Basic Web Scraping:
# Web scraping with the rvest package
library(rvest)

# Note: always check robots.txt and the site's terms of service.
# This is a general example - replace with an appropriate URL.

# Read an HTML page
# url <- "https://example-news-site.com"
# page <- read_html(url)

# Extract text from specific elements
# headlines <- page %>%
#   html_nodes(".headline-class") %>%
#   html_text()

# Extract article content
# articles <- page %>%
#   html_nodes(".article-content") %>%
#   html_text()

# Create a data frame
# news_data <- data.frame(
#   headline = headlines,
#   content = articles,
#   date = Sys.Date()
# )

# Example with static data instead
news_data <- data.frame(
  headline = c("Breaking: Major Scientific Discovery",
               "Economy Shows Signs of Recovery",
               "New Environmental Policy Announced"),
  content = c("Scientists have made a groundbreaking discovery...",
              "Economic indicators suggest improvement...",
              "Government announces new environmental measures..."),
  date = Sys.Date()
)
print(news_data)
7.8 Social Media Text Analysis
Twitter-like Data Analysis:
# Simulated social media data
social_data <- data.frame(
  user = paste0("user", 1:20),
  text = c("Love the new update! #awesome",
           "This app is so confusing #frustrated",
           "Great customer service! Thanks! #happy",
           "Why is this so slow? #annoying",
           "Amazing features! Well done! #impressed",
           "Crashes all the time #buggy",
           "Simple and effective #satisfied",
           "Needs more features #disappointed",
           "Perfect for my needs #grateful",
           "Too expensive for what it offers #overpriced",
           "Fantastic user interface #beautiful",
           "Difficult to navigate #confusing",
           "Fast and reliable #efficient",
           "Missing key functionality #incomplete",
           "Exceeded my expectations #surprised",
           "Poor performance on mobile #sluggish",
           "Excellent documentation #helpful",
           "Hard to find basic features #complicated",
           "Works exactly as advertised #honest",
           "Frequent updates are annoying #frustrated"),
  timestamp = seq(Sys.time(), by = "hour", length.out = 20)
)

# Extract hashtags
library(stringr)
hashtags <- str_extract_all(social_data$text, "#\\w+")
all_hashtags <- unlist(hashtags)
hashtag_freq <- table(all_hashtags)
print(sort(hashtag_freq, decreasing = TRUE))

# Sentiment analysis of social media posts
social_tokens <- social_data %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word")

# Sentiment by post
post_sentiment <- social_tokens %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(user, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment_score = positive - negative)
print(post_sentiment)

# Visualize sentiment over time
social_data_with_sentiment <- social_data %>%
  left_join(post_sentiment, by = "user")

ggplot(social_data_with_sentiment, aes(x = timestamp, y = sentiment_score)) +
  geom_line() +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(title = "Sentiment Over Time",
       x = "Time", y = "Sentiment Score")
7.9 Advanced Text Mining Techniques
N-grams Analysis:
# N-grams (sequences of n words)
library(tidytext)
library(tidyr)   # separate()

# Bigrams (2-word sequences)
text_bigrams <- text_sentiment %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)
print(text_bigrams)

# Separate the bigrams for analysis
bigrams_separated <- text_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

# Filter out stop words
bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)

# Most common bigrams
bigram_counts <- bigrams_filtered %>%
  count(word1, word2, sort = TRUE)
print(bigram_counts)

# Trigrams (3-word sequences)
text_trigrams <- text_sentiment %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3)
print(text_trigrams)
Named Entity Recognition (Basic):
# Simple named entity extraction
# This is a basic example - for advanced NER, use spacyr or other NLP packages

# Extract capitalized words (potential proper nouns)
text_example <- "John Smith visited New York City last week. He met with Apple Inc. executives."

# Simple regex for runs of capitalized words
entities <- str_extract_all(text_example, "\\b[A-Z][a-z]+(?:\\s+[A-Z][a-z]+)*")
print(unlist(entities))

# A more sophisticated approach would use specialized NLP libraries
# library(spacyr)   # Requires a Python spaCy installation
# spacy_initialize()
# parsed <- spacy_parse(text_example)
# entities <- entity_extract(parsed)
Text Similarity and Clustering:
# Document similarity using MinHash and Jaccard similarity (textreuse)
library(textreuse)

# Create document hashes for similarity comparison
docs <- c("The quick brown fox jumps over the lazy dog",
          "A fast brown fox leaps over a sleepy dog",
          "Climate change affects global weather patterns",
          "Global warming impacts worldwide climate systems")

doc_hashes <- minhash_generator(n = 100, seed = 123)
doc_corpus <- TextReuseCorpus(text = docs,
                              tokenizer = tokenize_ngrams, n = 3,
                              minhash_func = doc_hashes)

# Calculate pairwise similarities
similarities <- pairwise_compare(doc_corpus, jaccard_similarity)
print(similarities)

# Clustering based on text similarity
# pairwise_compare() fills only the upper triangle, so mirror it first
sim <- similarities
sim[lower.tri(sim)] <- t(sim)[lower.tri(sim)]
dist_matrix <- as.dist(1 - sim)
hc <- hclust(dist_matrix)
plot(hc, main = "Document Clustering by Text Similarity")