There are multiple tools to get started with R. I have explored Anaconda, as it gives me the flexibility of using Python alongside R in the same environment. Within Anaconda you can either install RStudio or use a Jupyter notebook.
Once you have installed Anaconda, go to the command prompt and create a new environment:
conda env create -f requirements/my-environment.yml
After that, activate the environment:
source activate my-environment
OR
Create a notebook:
jupyter notebook test_R_notebook.ipynb
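If the R kernel does not show up as an option inside Jupyter, one way to register it (assuming the IRkernel package is installed in the environment, for example via the conda package r-irkernel) is from an R session:
# register the R kernel with Jupyter so notebooks can use it
IRkernel::installspec()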
Once you have R up and running, either in RStudio or a Jupyter notebook, here are a few basic commands to get started.
# Read file
mydata<-read.csv("path/filename.csv")
or
mydata<-read.csv("path/filename.csv", header=TRUE)
# Print data
mydata
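A quick structural check saves time later; str() and head() are base R:
# inspect column types and preview the first rows
str(mydata)
head(mydata)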
# convert a text (categorical) column to numeric codes
employeeDataNum$department <- as.numeric(as.factor(employeeDataNum$department))
# remove NA values
mydata<-na.omit(mydata)
mydata
# Replace NA with average
mydata$column[is.na(mydata$column)] <- round(mean(mydata$column, na.rm = TRUE))
# Plot bars for items
data_subset<-mydata[c(7,8:20)]
data_subset<-ifelse(data_subset=='yes', 1,0)
barplot(data_subset)
# plot a boxplot
boxplot(mydata$column)
Check your library paths
Sys.getenv("R_LIBS_USER")
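You can also list every library location R searches, in order of precedence:
# all library paths R will look in
.libPaths()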
# Install a package
install.packages("AER")
# or install with dependencies
install.packages("AER", dependencies=TRUE)
# load a library
library(dplyr)
# Getting specific columns
datanew<-mydata[,c(7,8,9,10)]
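Since dplyr is already loaded above, you can also pick columns by name instead of by position; the column names below are only placeholders for whatever exists in your data:
# select columns by name (hypothetical column names)
datanew <- select(mydata, satisfaction_level, last_evaluation, department)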
Divide the data set into training and test sets (a rough 70/30 random split):
set.seed(4)
inTraining <- sample(2, nrow(mydata), prob=c(0.7,0.3), replace=TRUE)
trainset <- mydata[inTraining==1,]
testset <- mydata[inTraining==2,]
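It is worth checking that the split actually came out close to 70/30:
# rows per partition and their proportions
table(inTraining)
prop.table(table(inTraining))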
# Applying a linear model to the training data
linearmodel <- lm(Other_players ~ ., data = trainset)
linearmodel
# Predict on the test data
predictions <- predict(linearmodel, testset)
# plot actual vs. predicted values for the first 100 test rows
testsubset <- testset[1:100,]
plot(testsubset$Other_players, type="l")
lines(predictions[1:100], col="red")
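To put a number on the fit rather than eyeballing the plot, compute the root mean squared error on the test set; a minimal sketch using only base R:
# root mean squared error of the predictions on the test set
rmse <- sqrt(mean((testset$Other_players - predictions)^2, na.rm = TRUE))
rmse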
# Finding correlation among columns
correlation <- cor(mydata)
install.packages('corrplot', dependencies=TRUE)
library(corrplot)
corrplot(correlation,type='lower')
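Note that cor() expects numeric columns, so if your data frame mixes text and numbers, subset to the numeric columns first:
# keep only numeric columns before computing correlations
numeric_cols <- sapply(mydata, is.numeric)
correlation <- cor(mydata[, numeric_cols])
corrplot(correlation, type='lower')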
# Subsetting data based on some condition
employee_left<-subset(employeeData, left==1)
employee_left
# More plotting
plot(employeeData$salary)
hist(employeeData$last_evaluation)
# Summary
summary(employeeData)
# creating a decision tree (using 'left' as the target, matching the test data below)
library(rpart)
my_tree <- rpart(formula = left ~ ., data = traindata, method = "class")
plot(my_tree, margin = 0.1)
text(my_tree, pretty = TRUE, cex = 0.7)
# Confusion matrix
predtree<-predict(my_tree,testdata,type="class")
install.packages('e1071', dependencies=TRUE)
library(caret)
confusionMatrix(table(predtree, testdata$left))
# using random forest for analysis
library(randomForest)
employee_forest<-randomForest(left~.,data=traindata)
predforest<-predict(employee_forest,testdata,type="class")
confusionMatrix(table(predforest,testdata$left))
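A nice side benefit of randomForest is its variable importance measures, which show which columns drive the prediction:
# variable importance of the fitted forest
importance(employee_forest)
varImpPlot(employee_forest)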
# using naive bayes
library(e1071)
employee_naive<-naiveBayes(left~.,data=traindata)
pred_naive<-predict(employee_naive,testdata,type="class")
confusionMatrix(table(pred_naive,testdata$left))
# using svm
employee_svm<-svm(left~.,data=traindata)
pred_svm<-predict(employee_svm,testdata,type="class")
confusionMatrix(table(pred_svm,testdata$left))
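confusionMatrix() returns an object whose overall element contains the accuracy, so you can store each result and compare the models; a small sketch assuming the predictions above have all been computed:
# compare overall accuracy of the four models
cm_tree   <- confusionMatrix(table(predtree,   testdata$left))
cm_forest <- confusionMatrix(table(predforest, testdata$left))
cm_naive  <- confusionMatrix(table(pred_naive, testdata$left))
cm_svm    <- confusionMatrix(table(pred_svm,   testdata$left))
c(tree   = cm_tree$overall["Accuracy"],
  forest = cm_forest$overall["Accuracy"],
  naive  = cm_naive$overall["Accuracy"],
  svm    = cm_svm$overall["Accuracy"])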