# R codes
library(ggplot2)
setwd('/Users/mengze/Desktop/R/501 data')
DF <- read.csv('tuition_cost.csv')
salary<-read.csv('salary_potential.csv')
college<-read.csv('College_data.csv')
#print out the structure/types of the data using str
cat(str(DF))
cat(str(salary))
cat(str(college))
# change the data type of column state in order to change the value.
DF$state<-as.character(DF$state)
DF$degree_length<-as.integer(ifelse(DF$degree_length=="4 Year",4,2))
names(salary)[names(salary)=='state_name']='state'
names(college)[names(college)=='X']='name'
# create new columns that label the data (classify expensive colleges from all)
# calculate the median of out_of_state_tuition
median(DF$out_of_state_tuition)
DF$expensive<-as.factor(ifelse((DF$out_of_state_tuition/DF$degree_length)>6000,'Yes','No'))
salary$salary=(salary$early_career_pay+salary$mid_career_pay)/2
median(salary$salary)
salary$payment<-as.factor(ifelse(salary$salary>70000,'high','low'))
# visualization that is a figure of a small portion of the dataset in non-clean format
barplot(table(DF$expensive),data=DF,
main="table for expensive shools",xlab='type',ylab='frequency',col=c('red','blue'))
barplot(table(DF$state),data=DF, main='number of colleges in each state')
barplot(table(salary$make_world_better_percent),data=salary)
# see the variation of values in datasets before data cleaning
var(salary$make_world_better_percent[complete.cases(salary)])
#to see what does dataset look like before cleaning
head(DF)
head(salary)
head(college)
# keep track of the number of rows before data cleaning
nrow(DF) #2973
nrow(salary) #935
nrow(college) #777
# find rows with incomplete data
DF[!complete.cases(DF),]
salary[!complete.cases(salary),]
college[!complete.cases(college),]
# in salary datasets, many rows are missing on make_world_better_percent columns.
# so we calculate the mean and median of them to see if we can replace them with mean or median
mean_wordbetter<-mean(salary$make_world_better_percent[complete.cases(salary)])
median(salary$make_world_better_percent[complete.cases(salary)])
# repalce the missing values
DF$state[DF$state_code=='AS']='American Samoa'
DF$state[DF$state_code=='PR']='Puerto Rico'
DF$state[DF$state_code=='DC']='District of Columbia'
DF$state[DF$state_code=='GU']='Guam'
DF$state[DF$state_code=='VI']='Virgin Islands'
salary[!complete.cases(salary),]$make_world_better_percent=mean_wordbetter
#Create new columns
college$Accept_rate<-college$Accept/college$Apps
college$Enroll_rate<-college$Enroll/college$Accept
DF$tuition=DF$out_of_state_tuition
# remove useless data columns and redundant data
DF<-DF[-c(3,6,7,8,9,10)]
salary<-salary[-c(1,3,4,5)]
college<-college[-c(3,4,5,8,9,11,12,13,15,16,17,18)]
# change format of values in columns
salary$make_world_better_percent=salary$make_world_better_percent/100
salary$stem_percent=salary$stem_percent/100
salary$make_world_better_percent=round(salary$make_world_better_percent,digits=2)
college$Grad.Rate=college$Grad.Rate/100
college$Accept_rate=round(college$Accept_rate,digits=2)
college$Enroll_rate=round(college$Enroll_rate,digits=2)
# see datasets after data cleaning
head(DF)
head(salary)
head(college)
# Combine three datasets into a new dataset
newDF<-merge(DF,college,by='name')
nrow(newDF)# 534
#cleaning
newDF<-newDF[-7]
finalDF<-merge(newDF,salary,by='name')
# seek for missing values
finalDF[!complete.cases(finalDF)]
#clean new dataset
finalDF<-finalDF[-7]
# to see the outliners in newDF by visualization
boxplot(newDF$Accept_rate~newDF$type,data=newDF,xlab='type',ylab='accept rate')
boxplot(newDF$tuition~newDF$degree_length,data=newDF,xlab='degree_length',ylab='tuition')
# see did the data cleaning change the variation of values in datasets.
var(salary$make_world_better_percent)
# visualization that is a figure of a small portion of the dataset after data cleaning
barplot(table(DF$expensive),data=DF,
main="table for expensive shools",xlab='type',ylab='frequency',col=c('red','blue'))
barplot(table(DF$state),data=DF, main='number of colleges in each state')
barplot(table(salary$make_world_better_percent),data=salary)
# keep track of the number of rows after data cleaning
nrow(DF) # 2973
nrow(salary) #935
nrow(college) #777
nrow(newDF) #534