# For EDF 9770 Spring 2026 Lecture 1 Slides

# Lines that start with # are COMMENTS (notes to yourself)
# Each R function name is followed by parentheses that contain its arguments
# Data transformations (e.g., making new variables) must refer to a dataset 
# USE COLORS TO HELP YOU! Comments are GREEN, some system commands are BLUE,
#   Variables and file names are BLACK, labels and titles are also green
#   (but you can customize all colors under Tools --> Global Options)
# All x= arguments below refer to the name of the dataset and/or variable to be used

# To execute select code, highlight it -- then click the run icon above,
#   or use keyboard short-cut of control+enter in Windows or command+enter in Mac

################################################################################
#####                LECTURE 1 OPTIONS AND PACKAGES                        #####
################################################################################

# Session-wide printing options, one per line:
options(width=120)  # maximum number of characters per line of output
options(digits=8)   # minimum number of significant digits printed
options(scipen=9)   # penalty that delays switching to scientific notation

#####  Check to see if packages are downloaded, install if not; then load  #####

# requireNamespace() only checks whether a package is installed (it does not
#   load it), so it is used to decide whether install.packages() is needed;
#   library() then loads the package and stops with an error if it is missing

# To add input to text output files
if (!requireNamespace("TeachingDemos", quietly=TRUE)) install.packages("TeachingDemos")
library(TeachingDemos)

# To import excel .xls or .xlsx data as table
if (!requireNamespace("readxl", quietly=TRUE)) install.packages("readxl")
library(readxl)

# To summarize quantitative data
if (!requireNamespace("psych", quietly=TRUE)) install.packages("psych")
library(psych)

# To import data from Minitab, SAS, SPSS, Stata, or Systat,
# just load the foreign package, which ships with R (remove # to run as needed)
#library(foreign)

# Clear workspace (remove # and run as needed for troubleshooting purposes)
#rm(list = ls())


################################################################################
#####            LECTURE 1 DATA IMPORT AND MANIPULATION                    #####
################################################################################

# Set working directory (to import and export files to)
# Paste in the folder address where your data file is saved in quotes
# Note the slashes are backwards relative to Windows file paths
setwd("C:/Dropbox/26_EDF9770/Lecture1/")

# Read sheet "Data" of "GSS_Example.xlsx" (first row holds the variable names)
#   and convert the result straight to a data frame to use for analysis
Example1 = as.data.frame(
  read_excel(path="GSS_Example.xlsx", sheet="Data", col_names=TRUE)
)

# Label variables used below (add descriptive titles) using comments instead
#   because variable labels are not allowed in many analysis packages
# Format: name = Descriptive Variable Label
# marital = 5-Category Marital Status
# happy   = 5-Category Happy Rating
# income  = Annual Income in 1000s

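# txtStart (from TeachingDemos package) saves the commands you run and their
#   console output to a text file, while still showing them in the console
# By default (append=FALSE) it overwrites an existing file (re-run to start over)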
txtStart(file="Lecture1_Output.txt")

# Work-around to attach value labels to categorical variables marital and happy:
# first list the labels in the same order as the numeric values they describe
#   (position 1 = label for value 1, position 2 = label for value 2, ...)
maritalLabels = c("1.Married", "2.Widowed", "3.Divorced", "4.Separated", "5.Never")
happyLabels   = c("1.Unhappy", "2.Neither", "3.Fairly", "4.Very", "5.Completely")

# then use each numeric value as a position in that list, which creates new
#   text-format string variables holding the labels instead of the values
Example1[["maritalLabeled"]] = maritalLabels[Example1[["marital"]]]
Example1[["happyLabeled"]]   = happyLabels[Example1[["happy"]]]


################################################################################
#####            BEGIN EXAMPLE 1 UNIVARIATE STATISTICS                     #####
################################################################################

# For categorical variables

# table counts how often each category occurs (frequencies only);
#   useNA="ifany" adds a count of missing values whenever there are any
# the table is saved under a name so that it can be reused below
print("R Frequency Table for Categorical Variable maritalLabeled")
maritalFreq = table(x=Example1$maritalLabeled, useNA="ifany")
maritalFreq

# prop.table divides each frequency by the total to give proportions
print("R Proportion Table for Categorical Variable maritalLabeled") 
prop.table(maritalFreq)

# barplot draws one bar per entry of a numeric vector or table, so tabling our
#   string maritalLabeled variable first is the work-around to plot it
barplot(height=table(x=Example1$maritalLabeled, useNA="ifany"),
        ylab="Frequency", xlab="Marital Status") # y and x axis labels

# trick barplot into plotting percentages instead (proportions times 100)
# computed once here so the on-screen and saved plots use the same numbers
maritalPercent = prop.table(table(x=Example1$maritalLabeled, useNA="ifany"))*100
barplot(height=maritalPercent,
        ylab="Percentage", xlab="Marital Status") # y and x axis labels

# to save a plot: open a file, create the plot, then close the file
# filename= is the full name of the png argument (the shorter file= only
#   works because R matches partial argument names)
png(filename="R Marital Percentage Plot.png")  # open file
barplot(height=maritalPercent,
        ylab="Percentage", xlab="Marital Status") # y and x axis labels
dev.off()  # close file

# why frequency tables and bar plots are not useful for quantitative variables:
#   every distinct income value gets its own table entry and its own bar
incomeFreq = table(x=Example1$income, useNA="ifany")
incomeFreq
barplot(height=incomeFreq, ylab="Frequency", xlab="Income") # y and x axis labels

################################################################################
# For quantitative variables

# describe (from psych package) prints descriptive statistics for quantitative variables
# the variables to summarize and the quantiles to request are stored once, then reused
quantVars = c("income","age")
quartiles = c(.25,.50,.75)

# quant= requests list of quantiles, IQR requests inter-quartile range
# [ , quantVars] part says use all rows, but just the columns named in quantVars
print("R Descriptive Statistics for Quantitative Variables income and age")
describe(x=Example1[ , quantVars], quant=quartiles, IQR=TRUE)

# wrapping describe in print gives better control of the number of digits printed
print(describe(x=Example1[ , quantVars], quant=quartiles, IQR=TRUE), digits=3)

# the psych:: prefix guarantees the describe function from the psych package is used
print(psych::describe(x=Example1[ , quantVars], quant=quartiles, IQR=TRUE), digits=3)

# describe does not include variance, so here is a base R command to do so
# na.rm=TRUE drops missing values first (as describe does by default);
#   without it the result is NA whenever any income value is missing
var(x=Example1$income, na.rm=TRUE)

# likewise, here are base R commands to get the mean and SD separately with more precision
# can have more than one command on a line if separated by a semi-colon
mean(x=Example1$income, na.rm=TRUE); sd(x=Example1$income, na.rm=TRUE)

# histogram for income in frequency (freq=TRUE plots counts, not densities,
#   so the y axis is labeled "Frequency")
# breaks=15 is only a suggested number of bins; hist chooses nearby round cut points
hist(x=Example1$income, freq=TRUE, breaks=15, 
     ylab="Frequency", xlab="Annual Income in 1000s") # y and x axis labels

# to save a plot: open a file, create the plot, then close the file
# filename= is the full name of the png argument (the shorter file= only
#   works because R matches partial argument names)
png(filename="R Income Histogram Plot.png")  # open file
hist(x=Example1$income, freq=TRUE, breaks=15, 
     ylab="Frequency", xlab="Annual Income in 1000s") # y and x axis labels
dev.off()  # close file

# one more example: exploring an ordinal variable
# descriptive statistics use the numeric happy variable
describe(x=Example1[ , "happy"])

# the saved bar plot uses the labeled string version so the bars show the labels
# filename= is the full name of the png argument (the shorter file= only
#   works because R matches partial argument names)
png(filename="R Happy Percentage Plot.png")  # open file
barplot(height=prop.table(table(x=Example1$happyLabeled))*100,
        ylab="Percentage", xlab="Happiness Rating") # axis labels
dev.off()  # close file 


################################################################################
# Stop recording and close the output text file started by txtStart
# (the TeachingDemos:: prefix makes explicit which package txtStop comes from)
TeachingDemos::txtStop()

