# R codes

library(ggplot2)

setwd('/Users/mengze/Desktop/R/501 data')

DF <- read.csv('tuition_cost.csv')

salary<-read.csv('salary_potential.csv')

college<-read.csv('College_data.csv')

#print out the structure/types of the data using str

cat(str(DF))

cat(str(salary))

cat(str(college))

# change the data type of column state in order to change the value.

DF$state<-as.character(DF$state)

DF$degree_length<-as.integer(ifelse(DF$degree_length=="4 Year",4,2))

names(salary)[names(salary)=='state_name']='state'

names(college)[names(college)=='X']='name'

# create new columns that label the data (classify expensive colleges from all)

# calculate the median of out_of_state_tuition

median(DF$out_of_state_tuition)

DF$expensive<-as.factor(ifelse((DF$out_of_state_tuition/DF$degree_length)>6000,'Yes','No'))

salary$salary=(salary$early_career_pay+salary$mid_career_pay)/2

median(salary$salary)

salary$payment<-as.factor(ifelse(salary$salary>70000,'high','low'))

# visualization that is a figure of a small portion of the dataset in non-clean format

barplot(table(DF$expensive),data=DF,

main="table for expensive shools",xlab='type',ylab='frequency',col=c('red','blue'))

barplot(table(DF$state),data=DF, main='number of colleges in each state')

barplot(table(salary$make_world_better_percent),data=salary)

# see the variation of values in datasets before data cleaning

var(salary$make_world_better_percent[complete.cases(salary)])

#to see what does dataset look like before cleaning

head(DF)

head(salary)

head(college)

# keep track of the number of rows before data cleaning

nrow(DF) #2973

nrow(salary) #935

nrow(college) #777

# find rows with incomplete data

DF[!complete.cases(DF),]

salary[!complete.cases(salary),]

college[!complete.cases(college),]

# in salary datasets, many rows are missing on make_world_better_percent columns.

# so we calculate the mean and median of them to see if we can replace them with mean or median

mean_wordbetter<-mean(salary$make_world_better_percent[complete.cases(salary)])

median(salary$make_world_better_percent[complete.cases(salary)])

# repalce the missing values

DF$state[DF$state_code=='AS']='American Samoa'

DF$state[DF$state_code=='PR']='Puerto Rico'

DF$state[DF$state_code=='DC']='District of Columbia'

DF$state[DF$state_code=='GU']='Guam'

DF$state[DF$state_code=='VI']='Virgin Islands'

salary[!complete.cases(salary),]$make_world_better_percent=mean_wordbetter

#Create new columns

college$Accept_rate<-college$Accept/college$Apps

college$Enroll_rate<-college$Enroll/college$Accept

DF$tuition=DF$out_of_state_tuition

# remove useless data columns and redundant data

DF<-DF[-c(3,6,7,8,9,10)]

salary<-salary[-c(1,3,4,5)]

college<-college[-c(3,4,5,8,9,11,12,13,15,16,17,18)]

# change format of values in columns

salary$make_world_better_percent=salary$make_world_better_percent/100

salary$stem_percent=salary$stem_percent/100

salary$make_world_better_percent=round(salary$make_world_better_percent,digits=2)

college$Grad.Rate=college$Grad.Rate/100

college$Accept_rate=round(college$Accept_rate,digits=2)

college$Enroll_rate=round(college$Enroll_rate,digits=2)

# see datasets after data cleaning

head(DF)

head(salary)

head(college)

# Combine three datasets into a new dataset

newDF<-merge(DF,college,by='name')

nrow(newDF)# 534

#cleaning

newDF<-newDF[-7]

finalDF<-merge(newDF,salary,by='name')

# seek for missing values

finalDF[!complete.cases(finalDF)]

#clean new dataset

finalDF<-finalDF[-7]

# to see the outliners in newDF by visualization

boxplot(newDF$Accept_rate~newDF$type,data=newDF,xlab='type',ylab='accept rate')

boxplot(newDF$tuition~newDF$degree_length,data=newDF,xlab='degree_length',ylab='tuition')

# see did the data cleaning change the variation of values in datasets.

var(salary$make_world_better_percent)

# visualization that is a figure of a small portion of the dataset after data cleaning

barplot(table(DF$expensive),data=DF,

main="table for expensive shools",xlab='type',ylab='frequency',col=c('red','blue'))

barplot(table(DF$state),data=DF, main='number of colleges in each state')

barplot(table(salary$make_world_better_percent),data=salary)

# keep track of the number of rows after data cleaning

nrow(DF) # 2973

nrow(salary) #935

nrow(college) #777

nrow(newDF) #534