Getting Started with R

There are multiple tools for getting started with R. I chose Anaconda because it gives me the flexibility of using Python in the same IDE. Within Anaconda you can either install RStudio or use a Jupyter notebook.

Once you have installed Anaconda, open a command prompt and create a new environment:

conda env create -f requirements/my-environment.yml

After that activate the environment

source activate my-environment

Alternatively, create a notebook:

jupyter notebook test_R_notebook.ipynb

Once you have R up and running, either in RStudio or in a Jupyter notebook, here are a few basic commands to get started.

# Read file
# header = TRUE is already read.csv()'s default, so the two calls below are
# equivalent; the second one simply spells the option out.
mydata <- read.csv("path/filename.csv")
# or
mydata <- read.csv("path/filename.csv", header = TRUE)
# Print data
mydata

# Converting text data to integers
# Calling as.numeric() directly on a character column yields NA (with a
# warning) for every non-numeric string. Going through a factor first assigns
# each distinct text value its own integer code instead.
employeeDataNum$department <- as.integer(as.factor(employeeDataNum$department))

# Drop every row that contains at least one missing value, then show the result
mydata <- na.omit(mydata)
mydata

# Fill the missing entries of one column with that column's mean, rounded to
# the nearest whole number
mydata$column <- replace(
  mydata$column,
  is.na(mydata$column),
  round(mean(mydata$column, na.rm = TRUE))
)

# Plot bars for items
# Take columns 7 to 20 and recode them: "yes" becomes 1 and every other
# non-missing value becomes 0
data_subset <- mydata[7:20]
data_subset <- ifelse(data_subset == "yes", 1, 0)
# data_subset is now a numeric matrix; barplot() draws one bar per column
barplot(data_subset)

# Box plot of a single column
# Use the `mydata` data frame loaded above; a bare `data` refers to the base R
# function data(), which cannot be subset with `$`.
boxplot(mydata$column)


# Check your library paths
# R_LIBS_USER is the environment variable naming the per-user library directory
Sys.getenv("R_LIBS_USER")
# .libPaths() lists every library directory R currently searches
.libPaths()

# Install a package
install.packages("AER")
# Install with dependencies
install.packages("AER", dependencies = TRUE)
# Attach an installed package so its functions can be called by name
library(dplyr)

# Getting specific columns
# Keep all rows, but only columns 7 through 10
datanew <- mydata[, 7:10]

# Divide data set into training and test sets
# Fix the random seed so the same split is produced on every run
set.seed(4)
# Label each row 1 (probability 0.7) or 2 (probability 0.3); the split is
# therefore approximately, not exactly, 70/30
inTraing <- sample(2, nrow(mydata), prob = c(0.7, 0.3), replace = TRUE)
trainset <- mydata[inTraing == 1, ]
testset <- mydata[inTraing == 2, ]

# Applying the algorithm on training data
# Name the response by its bare column name. Writing trainset$Other_players on
# the left-hand side stops `.` from excluding that column, so the response
# would also be used as one of its own predictors.
linermodel <- lm(Other_players ~ ., data = trainset)
linermodel

# Predict for test data
# Store the result under its own name so the predict() function is not masked
predicted <- predict(linermodel, newdata = testset)

# Plot actual values (black line) against predicted values (red line)
# Show at most 100 rows; indexing 1:100 on a shorter test set would create
# all-NA rows
n_show <- min(100, nrow(testset))
testsubset <- testset[seq_len(n_show), ]
plot(testsubset$Other_players, type = "l")
lines(predicted[seq_len(n_show)], col = "red")

# Finding correlation among columns
# cor() only accepts numeric input, so restrict the data to its numeric columns
numeric_columns <- vapply(mydata, is.numeric, logical(1))
correlation <- cor(mydata[numeric_columns])
# Install corrplot only when it is missing instead of on every run
if (!requireNamespace("corrplot", quietly = TRUE)) {
  install.packages("corrplot", dependencies = TRUE)
}
library(corrplot)
# Draw only the lower triangle of the correlation matrix
corrplot(correlation, type = "lower")

# Subsetting data based on some condition
# Keep the rows whose `left` column equals 1. which() discards rows where the
# comparison is NA, and drop = FALSE keeps the result a data frame.
left_rows <- which(employeeData[["left"]] == 1)
employee_left <- employeeData[left_rows, , drop = FALSE]
employee_left

# More plotting
# Generic plot of the salary column; what is drawn depends on the column's
# class
plot(employeeData$salary)
# Histogram of the last_evaluation column
hist(employeeData$last_evaluation)

# Summary
# Per-column summary of the whole data frame
summary(employeeData)

# Creating a decision tree
library(rpart)
# `formulacolumn` is a placeholder: replace it with the name of the column to
# predict. method = "class" requests a classification tree, which the
# predict(..., type = "class") call used for the confusion matrix requires.
my_tree <- rpart(
  formula = formulacolumn ~ .,
  data = traindata,
  method = "class"
)
# Draw the tree with a little extra margin, then label its nodes
plot(my_tree, margin = 0.1)
text(my_tree, pretty = TRUE, cex = 0.7)

# Confusion matrix
# Predicted class labels for the held-out data
predtree <- predict(my_tree, testdata, type = "class")
# e1071 is needed by caret's confusionMatrix(); install it only when missing
if (!requireNamespace("e1071", quietly = TRUE)) {
  install.packages("e1071", dependencies = TRUE)
}
library(caret)
# Cross-tabulate the predictions against the actual `left` values
confusionMatrix(table(predtree, testdata$left))

# Using random forest for analysis
library(randomForest)
# Model `left` from all other columns of the training data
employee_forest <- randomForest(left ~ ., data = traindata)
# "response" is the documented name for predicted values; the legacy spelling
# "class" is only accepted as an alias for it
predforest <- predict(employee_forest, testdata, type = "response")
confusionMatrix(table(predforest, testdata$left))

# Using naive Bayes
library(e1071)
# Fit the classifier with `left` as the outcome and all other columns as
# predictors
employee_naive <- naiveBayes(left ~ ., data = traindata)
# Ask for predicted class labels on the test data
pred_naive <- predict(employee_naive, newdata = testdata, type = "class")
# Compare predicted and actual labels
confusionMatrix(table(pred_naive, testdata$left))

# Using SVM
# svm() comes from e1071; load it here so this section also runs on its own
library(e1071)
employee_svm <- svm(left ~ ., data = traindata)
# predict() for svm models has no `type` argument, so none is passed
pred_svm <- predict(employee_svm, testdata)
confusionMatrix(table(pred_svm, testdata$left))