Read the instruction carefully and think about how to develop R code to answer each questions.
iris
and
basic data manipulation. If you have issues with, please do recap
in workshop 04. Also, there are several trick and tip that you
should aware before beginning the workshop.sink()
, You can
copy table or list to clipboard/memory, equivalent to [Ctrl]+ [C], with
write.table()
x.out <- head(iris)
write.table(x.out,"clipboard",sep=" \t")
View()
view.okc <- function(x,row.names=FALSE,col.names=TRUE,...) {
write.table(x,"clipboard",sep="\t",row.names=row.names,col.names=col.names,...)
##write.table(x,"clipboard.out",sep=" \t",row.names=row.names,col.names=col.names,...)
View(x)
}
#view.okc(iris)
note NON-ASCII character encoding and how OS distinguish Thai in memory can lead to problem, you may need to pipe as text file before copy to excel
iris.temp <- iris
iris.temp$id <- 1:nrow(iris)
##-- base RECAP
set.seed(17)
iris.smpl <- iris.temp[sample.int(nrow(iris.temp),size=30),]
##-- data.table
set.seed(17)
iris.DT <- data.table::as.data.table(iris.temp)
iris.smpl <- iris.DT[id%in% sample.int(.N,30)]
##-- base RECAP
nRow <- nrow(iris)
iris.sys3 <- iris.temp[seq(1,nRow,3),]
##-- data.table
iris.sys3 <- iris.DT[id%in% seq(1,nRow,3)]
##-- base/sampling
require(sampling)
set.seed(17)
strat.idx <- strata(iris.DT,stratanames ="Species",size=c(10,10,10))$ID_unit
iris.strat<- iris.DT[id%in%strat.idx]
##-- data.table
set.seed(17)
strat.idx <- iris.DT[,sample(.I,10),by=Species]$V1
iris.strat <- iris.DT[id%in%strat.idx]
petal.width
and
petal.length
in the previous Workshop as
tWidth
and tLenght
, respectivelytype | width of petal | length of petal |
---|---|---|
low | \([0.00,0.75)\) | \([0.0,2.5)\) |
medium | \([0.75,1.75)\) | \([2.5,5.0)\) |
high | \([1.75,\infty)\) | \([5.0,\infty)\) |
set.seed(17)
clus.idx <- iris.DT[,sample(.I,3),by=.(tWidth,tLength)]$V1
iris.clus <- iris.DT[id%in%clus.idx]
head(iris.clus[,1:5])
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1: 5.2 4.1 1.5 0.1 setosa
## 2: 5.1 3.4 1.5 0.2 setosa
## 3: 5.0 3.3 1.4 0.2 setosa
## 4: 5.5 2.4 3.8 1.1 versicolor
## 5: 5.7 3.0 4.2 1.2 versicolor
## 6: 5.1 2.5 3.0 1.1 versicolor
Many data mining technique require scaling
\[ \acute{u} = \frac{u - \mu_u}{\sigma_u} \]
avg.length <- mean(iris$Sepal.Length)
sd.length <- sd(iris$Sepal.Length)
norm.length<- (iris$Sepal.Length - avg.length)/sd.length
head(cbind(iris$Sepal.Length,scale(iris$Sepal.Length),norm.length))
## norm.length
## [1,] 5.1 -0.8976739 -0.8976739
## [2,] 4.9 -1.1392005 -1.1392005
## [3,] 4.7 -1.3807271 -1.3807271
## [4,] 4.6 -1.5014904 -1.5014904
## [5,] 5.0 -1.0184372 -1.0184372
## [6,] 5.4 -0.5353840 -0.5353840
Manhattan Distance A grid distance.
\[ d_{\mbox{man}}(\mathbf{p},\mathbf{q}) = |p_x -q_x| + |p_y - q_y| \]
Euclidean Distance A plane distance.
\[ d_{\mbox{2}}(\mathbf{p},\mathbf{q}) = \sqrt{ (p_x - q_x)^2 + (p_y - q_y)^2 } \]
Chebyshev Distance A minimax distance for multiple/independent direction distance, e.g., AS/RS
\[ d_{\infty}(\mathbf{p},\mathbf{q}) = \max\left( |p_x -q_x|, |p_y - q_y| \right) \]
p-Distance
\[ d_{\mbox{p}}(\mathbf{p},\mathbf{q}) = \sqrt[p]{ \sum_i (p_i -q_i)^p } \]
dist()
data <- as.matrix( cbind(iris$Sepal.Width,iris$Sepal.Length) )
dist(data[1:5,],diag=T,method="manhattan")
## 1 2 3 4 5
## 1 0.0
## 2 0.7 0.0
## 3 0.7 0.4 0.0
## 4 0.9 0.4 0.2 0.0
## 5 0.2 0.7 0.7 0.9 0.0
\[ d_{M}(\mathbf{p},\mathbf{Q}) = \sqrt{ (\mathbf{p} - \boldsymbol{\mu})^{T} \cdot \mathbf{S}^{-1} \cdot (\mathbf{p} - \boldsymbol{\mu})} \] when,
\(\boldsymbol{\mu}\) = mean vector of data matrix \(\mathbf{Q}\) \(\mathbf{S}\) = covariance matrix of data matrix \(\mathbf{Q}\)
colMaha <- function(p,Q.mat,tol=1e-3){
## Q.mat <- iris[,1:4]
## p <- c(5,3,3,2)
mu <- colMeans(Q.mat)
cov.mat <- cov(Q.mat)
nDim <- nrow(cov.mat)
if(abs(det(cov.mat)) < tol){
cov.mat <- cov.mat+runif(nDim)*dig(nDim)
}
cov.inv <- solve(cov.mat)
result2 <- t(p-mu) %*% cov.inv %*% (p-mu)
return(sqrt(result2[1]))
}
iris
dataset, this question of workshp will
show different non-tree based methods (a.k.a non-rule
based) to classify data and its limitation. It is important to note
that:iris.test
using set.seed(17)
. The remainer will be called
iris.train
cbind(head(iris.train[,-(1:4)]),head(iris.test[,-(1:4)]))
## Species vSpecies Species vSpecies
## 1 setosa 0 setosa 0
## 2 setosa 0 setosa 0
## 3 setosa 0 setosa 0
## 4 setosa 0 setosa 0
## 5 setosa 0 setosa 0
## 6 setosa 0 setosa 0
knn()
iris.knn.1 <- class::knn(train = iris.train[,1:4], test = iris.test[,1:4],cl = iris.train[,5], k= 1)
iris.knn.9 <- class::knn(train = iris.train[,1:4], test = iris.test[,1:4],cl = iris.train[,5], k= 9)
comp.DT <- data.table::as.data.table( cbind(iris.test[,5],iris.knn.1,iris.knn.9) )
plot(iris.test[,1:2],col=as.numeric(iris.test$Species),pch=16,cex=0.8)
points(iris.test[,1:2],col=as.numeric(iris.knn.1),pch=1,cex=1.2)
caret
packageclass::knn
works for both
matrix
and data.frame
, but not
data.table
class::knn
can take non-factor as class iris.temp <- class::knn(train = iris.train[,1:4], test = iris.test[,1:4],cl = iris.train[,5], k= 1)
ftable(iris.temp,iris.knn.1)
## iris.knn.1 setosa versicolor virginica
## iris.temp
## setosa 10 0 0
## versicolor 0 10 0
## virginica 0 0 10
scaling <- 100
iris.train$Sepal.Length <- scaling*iris.train$Sepal.Length
iris.test$Sepal.Length <- scaling*iris.test$Sepal.Length
iris.knn.S1 <- class::knn(train = iris.train[,1:4], test = iris.test[,1:4],cl = iris.train[,5], k= 1)
iris.knn.S9 <- class::knn(train = iris.train[,1:4], test = iris.test[,1:4],cl = iris.train[,5], k= 9)
iris.train$Sepal.Length <- (1/scaling)*iris.train$Sepal.Length
iris.test$Sepal.Length <- (1/scaling)*iris.test$Sepal.Length
plot(iris.test[,1:2],col=as.numeric(iris.test$Species),pch=16,cex=0.8)
points(iris.test[,1:2],col=as.numeric(iris.knn.S9),pch=1,cex=1.4)
iris.min <- floor(apply(iris[,1:4],2,min))
iris.max <- ceiling(apply(iris[,1:4],2,max))
gap.grid <- 0.5
iris.grid <- as.data.frame(expand.grid(seq(iris.min[1],iris.max[1],gap.grid),seq(iris.min[2],iris.max[2],gap.grid),
seq(iris.min[3],iris.max[3],gap.grid),seq(iris.min[4],iris.max[4],gap.grid) ))
colnames(iris.grid) <- colnames(iris.train[,1:4])
iris.vor.1 <- class::knn(train = iris.train[,1:4], test = iris.grid[,1:4],cl = iris.train[,5], k= 1)
plot(jitter(iris.grid[,3],3),jitter(iris.grid[,4],3),col=as.numeric(iris.vor.1),pch=1,cex=0.1)
points(iris.DT[[3]],iris.DT[[4]],pch=16,cex=1,col=iris.DT[[5]])
pairs(apply(iris.grid,2,jitter,factor=3),col=as.numeric(iris.vor.1),pch=1,cex=0.5)
knn
? require(neuralnet)
set.seed(17)
iris.ann <- neuralnet(Species~.,data = iris.train,hidden=c(2))
plot(iris.ann)
require(e1071)
iris.svm <- svm(Species~Petal.Length+Petal.Width,data=iris.train,kenel="linear",scale = F)
species.svm <- predict(iris.svm,newdata=iris.test)
ftable(species.svm,iris.test$Species)
## setosa versicolor virginica
## species.svm
## setosa 10 0 0
## versicolor 0 9 1
## virginica 0 1 9
plot(iris.svm,iris.test,Petal.Length~Petal.Width,svSymbol = 8, dataSymbol =16,
symbolPalette=1:4,color.palette=terrain.colors)
e1071
provides tune
function using grid serach. Here are possible
tuning parameter of SVM: iris.svm.tune <- tune(svm, Species~., data=iris.train, ,kernel="polynomial",
ranges=list( cost= 10^(-3:3), gamma=seq(0,5,1.0)))
## summary(iris.svm.tune)
plot(iris.svm.tune,col=terrain.colors )
species.best <- predict(iris.svm.tune$best.model,newdata = iris.test)
ftable(species.best,iris.test$Species)
## setosa versicolor virginica
## species.best
## setosa 10 0 0
## versicolor 0 10 0
## virginica 0 0 10
iris
dataset, this
question of workshp will show different Tree Based
Techquie for classify data and its limitation. It is important to note
that:rpart
require(rpart)
iris.train$vSpecies <- NULL
iris.rpart <- rpart::rpart(Species~.,data=iris.train[,1:5],
control=rpart.control(minbucket = 1)
)
rpart.plot::prp(iris.rpart,extra=104,type=1)
##-- summary
## summary(iris.rpart)
note This is a repeat of the previous workshop
rpart
requires monitoring of
Complexity Parameter (\(cp\)), a measurement of miss-classification
after cross-validation, to prevent overfitting. A user can also control
\(cp\) and other paramerter using
rpart.control()
\[ cp = \sum_{i \in \mbox{t.node}} n_i +\lambda_{cv} \, s_i \]
rpart::plotcp(iris.rpart)
rpart::plotcp()
to
provide threshold (dotted line) and prune overfitting brach iris.rpart <- rpart(Species~.,data=iris.train,control=rpart.control(cp=-0.1))
rpart::plotcp(iris.rpart)
iris.prune <- prune(iris.rpart,cp = 0.01)
rpart.plot::prp(iris.prune,extra=104,type=1)
species.rpart <- apply(predict(iris.rpart,newdata=iris.test),1,which.max)
ftable(species.rpart,iris.test$Species)
## setosa versicolor virginica
## species.rpart
## 1 10 0 0
## 2 0 9 2
## 3 0 1 8
iris.rpart$variable.importance
## Petal.Width Petal.Length Sepal.Length Sepal.Width
## 74.72615 70.05980 42.40057 27.37209
rpart.plot::rpart.rules(iris.rpart)
## Species seto vers virg
## setosa [1.00 .00 .00] when Petal.Length < 2.5
## versicolor [ .00 .57 .43] when Petal.Length >= 4.8 & Petal.Width < 1.8
## versicolor [ .00 1.00 .00] when Petal.Length is 2.5 to 4.8 & Petal.Width < 1.8
## virginica [ .00 .00 1.00] when Petal.Length >= 2.5 & Petal.Width >= 1.8
ctree()
ctree()
from
party
package require(party)
iris.ctree <- ctree(Species~.,data=iris.train)
plot(iris.ctree)
predict()
species.ctree <- predict(iris.ctree,newdata=iris.test)
ftable(species.ctree,iris.test$Species)
## setosa versicolor virginica
## species.ctree
## setosa 10 0 0
## versicolor 0 9 2
## virginica 0 1 8
caret::confusionMatrix()
caret::confusionMatrix(species.ctree,iris.test$Species)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 9 2
## virginica 0 1 8
##
## Overall Statistics
##
## Accuracy : 0.9
## 95% CI : (0.7347, 0.9789)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 1.665e-10
##
## Kappa : 0.85
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9000 0.8000
## Specificity 1.0000 0.9000 0.9500
## Pos Pred Value 1.0000 0.8182 0.8889
## Neg Pred Value 1.0000 0.9474 0.9048
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3000 0.2667
## Detection Prevalence 0.3333 0.3667 0.3000
## Balanced Accuracy 1.0000 0.9000 0.8750
require(randomForest)
set.seed(17)
iris.rf <- randomForest(Species~.,data=iris.train, localImp = TRUE)
iris.rf
##
## Call:
## randomForest(formula = Species ~ ., data = iris.train, localImp = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 4.17%
## Confusion matrix:
## setosa versicolor virginica class.error
## setosa 40 0 0 0.000
## versicolor 0 38 2 0.050
## virginica 0 3 37 0.075
##-- important attribute
iris.rf$importance
## setosa versicolor virginica MeanDecreaseAccuracy
## Sepal.Length 0.03371129 0.007949114 0.050464549 0.030673766
## Sepal.Width 0.01053397 0.002547001 -0.002850179 0.003611761
## Petal.Length 0.37182159 0.302525317 0.337814029 0.331620679
## Petal.Width 0.30752352 0.291394597 0.308560730 0.300435250
## MeanDecreaseGini
## Sepal.Length 6.066042
## Sepal.Width 1.823676
## Petal.Length 35.726250
## Petal.Width 35.675399
plot(iris.rf)
##-- voting
head(iris.rf$votes)
## setosa versicolor virginica
## 1 1 0 0
## 2 1 0 0
## 3 1 0 0
## 4 1 0 0
## 5 1 0 0
## 6 1 0 0
randomForestExplainer
package.
min_depth_distribution()
shows number of trees and average
min depth that use each attribute. require(randomForestExplainer)
min_depth_frame <- min_depth_distribution(iris.rf)
head(min_depth_frame, n = 5)
## tree variable minimal_depth
## 1 1 Petal.Length 0
## 2 1 Petal.Width 3
## 3 1 Sepal.Length 2
## 4 1 Sepal.Width 3
## 5 2 Petal.Length 0
plot_min_depth_distribution(min_depth_frame)
measure_importance()
importance_frame <- measure_importance(iris.rf)
importance_frame
## variable mean_min_depth no_of_nodes accuracy_decrease gini_decrease
## 1 Petal.Length 0.930274 1017 0.331620679 35.726250
## 2 Petal.Width 1.006135 986 0.300435250 35.675399
## 3 Sepal.Length 2.560998 447 0.030673766 6.066042
## 4 Sepal.Width 3.111526 389 0.003611761 1.823676
## no_of_trees times_a_root p_value
## 1 480 234 1.059154e-37
## 2 489 197 4.783565e-31
## 3 301 67 1.000000e+00
## 4 258 2 1.000000e+00
plot_multi_way_importance(importance_frame, size_measure = "no_of_nodes")
important_variables()
plot_importance_ggpairs(importance_frame)
vars <- important_variables(importance_frame, k = 3, measures = c("mean_min_depth", "no_of_trees"))
vars
## [1] "Petal.Length" "Petal.Width" "Sepal.Length"
Because of the simplification of iris
, many users may
unaware of \(C_p\) and overfiting of
decision tree `party::rpart()``.
require(data.table)
iris.MT <- as.data.table(iris)
iris.MT[,id:=.I]
##-- create numerical value of Species
iris.DT[,vSpecies:=as.numeric(Species)-1]
##-- randomize original attributes
set.seed(17)
for(i in 1:4){
iris.MT[[i]] <- iris.MT[[i]] + 2*abs( rnorm(nrow(iris.MT) ) )
}
##-- create new attributes
iris.MT[,widLog := log2(Sepal.Width) + log10(Petal.Width) ]
iris.MT[,lenLog := log10(Petal.Width) + log2(Petal.Width) ]
iris.MT[,widRatio:= sqrt(Sepal.Width)/Petal.Width]
iris.MT[,lenRatio:= Sepal.Length/sqrt(Petal.Length)]
iris.MT[,sepRatio:= Sepal.Length/sqrt(Sepal.Width) ]
iris.MT[,petRatio:= sqrt(Petal.Length)/Petal.Width]
##-- remove original attributes
iris.MT[,Sepal.Width:=NULL]
iris.MT[,Sepal.Length:=NULL]
iris.MT[,Petal.Width:=NULL]
iris.MT[,Petal.Length:=NULL]
set.seed(17)
strat.idx <- iris.MT[,sample(.I,10),by=Species]$V1
iris.train <- iris.MT[id%in%setdiff(1:150,strat.idx)]
iris.test <- iris.MT[id%in%strat.idx]
iris.train[,id:=NULL]
iris.test[,id:=NULL]
Consider dataset Cars93
in
MASS
package, explore the data, answer the
following question, and apply the classification technique to predict
whether the origin of a car is US or Non-US.
duplicated
. The dataset
contains no duplicated recored. require(data.table)
cars.DT <- as.data.table(MASS::Cars93)
##-- check for any duplicated data
any(duplicated(cars.DT))
## [1] FALSE
data.table
)na
(not available: NA
) and
contemplate actions. To count columns/attributes that
contains na
, we need to use some trick, particularly query
num of na
and create a new data.table. nCol <- ncol(cars.DT)
##-- query for number of na
na.cars <- cars.DT[,lapply(.SD,function(o){length(which(is.na(o)))}),.SDcols= 1:nCol ]
##-- create new data.table
count.DT <- data.table(attri=names(na.cars),num.na=as.numeric(na.cars) )
count.DT[num.na>0]
## attri num.na
## 1: Rear.seat.room 2
## 2: Luggage.room 11
na
comes from two attributes, i.e.,
Rear.seat.room
and Luggage.room
. Therefore, we
need to show details by record/row and by Type. cars.DT[is.na(Rear.seat.room)][,.(Model,Rear.seat.room,Luggage.room)]
## Model Rear.seat.room Luggage.room
## 1: Corvette NA NA
## 2: RX-7 NA NA
cars.DT[is.na(Luggage.room)][,.(Model,Rear.seat.room,Luggage.room)]
## Model Rear.seat.room Luggage.room
## 1: Lumina_APV 30.5 NA
## 2: Astro 33.5 NA
## 3: Corvette NA NA
## 4: Caravan 26.5 NA
## 5: Aerostar 30.0 NA
## 6: MPV 27.5 NA
## 7: RX-7 NA NA
## 8: Quest 27.0 NA
## 9: Silhouette 30.5 NA
## 10: Previa 35.0 NA
## 11: Eurovan 34.0 NA
cars.DT[is.na(Rear.seat.room) | is.na(Luggage.room) ,.N,by=Type]
## Type N
## 1: Van 9
## 2: Sporty 2
##-- replace with 0
cars.DT[is.na(Rear.seat.room),Rear.seat.room:=0]
cars.DT[is.na(Luggage.room),Luggage.room:=0]
data.frame
)is.na()
to the whole
data.table and find row/col of na
. na.mat <- which(is.na(cars.DT),arr.ind = T)
na.row <- sort(unique(na.mat[,1]))
na.col <- sort(c(1,unique(na.mat[,2])))
##-- call data.table by without its property | data.table becomes data.frame
cars.DT[na.row,na.col,with=F]
## Manufacturer Rear.seat.room Luggage.room
## 1: Chevrolet 30.5 NA
## 2: Chevrolet 33.5 NA
## 3: Chevrolet NA NA
## 4: Dodge 26.5 NA
## 5: Ford 30.0 NA
## 6: Mazda 27.5 NA
## 7: Mazda NA NA
## 8: Nissan 27.0 NA
## 9: Oldsmobile 30.5 NA
## 10: Toyota 35.0 NA
## 11: Volkswagen 34.0 NA
##-- replace with 0 // loop with set
for(i in 1:nrow(na.mat)){
## i <- 1
set(cars.DT,i = na.mat[i,1],j=na.mat[i,2],0)
}
summarytools
package is another trick that you should
no forget print(summarytools::dfSummary(cars.DT),method="render")
No | Variable | Stats / Values | Freqs (% of Valid) | Graph | Valid | Missing | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | Manufacturer [factor] |
|
|
93 (100.0%) | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
2 | Model [factor] |
|
|
93 (100.0%) | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
3 | Type [factor] |
|
|
93 (100.0%) | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
4 | Min.Price [numeric] |
|
79 distinct values | 93 (100.0%) | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
5 | Price [numeric] |
|
81 distinct values | 93 (100.0%) | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
6 | Max.Price [numeric] |
|
79 distinct values | 93 (100.0%) | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
7 | MPG.city [integer] |
|
21 distinct values | 93 (100.0%) | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
8 | MPG.highway [integer] |
|
22 distinct values | 93 (100.0%) | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
9 | AirBags [factor] |
|
|
93 (100.0%) | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
10 | DriveTrain [factor] |
|
|
93 (100.0%) | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
11 | Cylinders [factor] |
|
|
93 (100.0%) | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
12 | EngineSize [numeric] |
|
26 distinct values | 93 (100.0%) | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
13 | Horsepower [integer] |
|
57 distinct values | 93 (100.0%) | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
14 | RPM [integer] |
|
24 distinct values | 93 (100.0%) | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
15 | Rev.per.mile [integer] |
|
78 distinct values | 93 (100.0%) | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
16 | Man.trans.avail [factor] |
|
|
93 (100.0%) | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
17 | Fuel.tank.capacity [numeric] |
|
38 distinct values | 93 (100.0%) | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
18 | Passengers [integer] |
|
|
93 (100.0%) | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
19 | Length [integer] |
|
51 distinct values | 93 (100.0%) | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
20 | Wheelbase [integer] |
|
27 distinct values | 93 (100.0%) | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
21 | Width [integer] |
|
16 distinct values | 93 (100.0%) | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
22 | Turn.circle [integer] |
|
14 distinct values | 93 (100.0%) | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
23 | Rear.seat.room [numeric] |
|
25 distinct values | 93 (100.0%) | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
24 | Luggage.room [integer] |
|
17 distinct values | 93 (100.0%) | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
25 | Weight [integer] |
|
81 distinct values | 93 (100.0%) | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
26 | Origin [factor] |
|
|
93 (100.0%) | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
27 | Make [factor] |
|
|
93 (100.0%) | 0 (0.0%) |
Generated by summarytools 1.0.1 (R version 4.2.3)
2023-10-23
pairs()
and compare with Origin. However, there are 18
numerical attributes; therefore, we plot only 6 at a time. ##-- get index of numerical attribute
nCol <- ncol(cars.DT)
num.idx <- which(as.logical(cars.DT[,lapply(.SD,is.numeric),.SDcols=1:nCol]))
pairs(cars.DT[,num.idx[1:6],with=F],col=cars.DT$Origin)
pairs(cars.DT[,num.idx[7:12],with=F],col=cars.DT$Origin)
pairs(cars.DT[,num.idx[13:18],with=F],col=cars.DT$Origin)
##-- report mean
cars.DT[,lapply(.SD,mean,na.rm=T),.SDcols=num.idx,by=Origin ]
## Origin Min.Price Price Max.Price MPG.city MPG.highway EngineSize
## 1: non-USA 17.75556 20.50889 23.25556 23.86667 30.08889 2.242222
## 2: USA 16.53542 18.57292 20.62708 20.95833 28.14583 3.066667
## Horsepower RPM Rev.per.mile Fuel.tank.capacity Passengers Length
## 1: 139.8889 5590.000 2559.889 16.25111 4.822222 177.7556
## 2: 147.5208 4990.625 2118.750 17.05208 5.333333 188.3125
## Wheelbase Width Turn.circle Rear.seat.room Luggage.room Weight
## 1: 102.0444 67.68889 37.33333 26.90000 11.42222 2942.333
## 2: 105.7292 70.95833 40.47917 27.54167 13.02083 3195.312
geom_violin()
and
melt()
in ggplot2
and data.table
packages to see distribution of normalize attributes require(data.table)
require(ggplot2)
require(dplyr)
cars.num.DT <- cars.DT[,num.idx,with=F]
cars.name.num <- colnames(cars.num.DT)
cars.num.DT <- as.data.table(apply(cars.num.DT,2,scale))
cars.num.DT[,id:=1:.N]
cars.num.DT[,Origin:=cars.DT$Origin]
cars.num.long <- melt.data.table(cars.num.DT,id.vars=c("id","Origin"))
cars.num.long[variable%in%cars.name.num[13:18]] %>% ggplot(aes(y=value,x=Origin,fill=Origin) ) +
geom_violin() + facet_grid(~variable)
cor()
and viusalize with
corrplot()
and ordering by require(corrplot)
cars.num.DT <- cars.DT[,num.idx,with=F]
cars.cor <- cor(as.matrix(cars.num.DT))
corrplot(cars.cor,method = "ellipse",type="upper",order="AOE",tl.col="brown")
##-- get index of factor attributes
fac.idx <- which(as.logical(cars.DT[,lapply(.SD,is.factor),.SDcols=1:nCol]))
##-- report factor of highest freq
temp.DT <- cars.DT[,lapply(.SD,function(o){ names(sort(table(o),T))[1] }),.SDcols=fac.idx,by=Origin ]
##-- report highest freq
temp.DT <- rbind(temp.DT,cars.DT[,lapply(.SD,function(o){ sort(table(o),T)[1] }),.SDcols=fac.idx,by=Origin ] )
##-- combine
temp.DT[order(Origin)]
## Origin Manufacturer Model Type AirBags DriveTrain Cylinders
## 1: USA Chevrolet Achieva Large Driver only Front 4
## 2: USA 8 1 11 23 34 22
## 3: non-USA Mazda 100 Small Driver only Front 4
## 4: non-USA 5 1 14 20 33 27
## Man.trans.avail Origin Make
## 1: No USA Buick Century
## 2: 26 48 1
## 3: Yes non-USA Acura Integra
## 4: 39 45 1
ftable()
frequency cars.fac.DT <- cars.DT[,fac.idx,with=F]
cars.name.fac <- colnames(cars.fac.DT)
nList <- length(cars.name.fac)
cross.table <- list()
for(i in 1:nList){
cross.table[[i]] <- ftable(cars.fac.DT[[i]],cars.fac.DT$Origin)
}
names(cross.table) <- cars.name.fac
cross.table$Origin <- NULL
cross.table$Type
## USA non-USA
##
## Compact 7 9
## Large 11 0
## Midsize 10 12
## Small 7 14
## Sporty 8 6
## Van 5 4
cars.DT[,.(min.price=mean(Min.Price),avg.price=mean(Price),max.price=mean(Max.Price)),by=.(Type,Origin)][order(Type)]
## Type Origin min.price avg.price max.price
## 1: Compact non-USA 19.111111 22.40000 25.67778
## 2: Compact USA 11.300000 12.82857 14.35714
## 3: Large USA 22.936364 24.30000 25.67273
## 4: Midsize non-USA 27.575000 31.75000 35.90000
## 5: Midsize USA 19.960000 21.78000 23.61000
## 6: Small non-USA 8.578571 10.22857 11.86429
## 7: Small USA 8.128571 10.04286 11.98571
## 8: Sporty USA 16.100000 19.37500 22.67500
## 9: Sporty non-USA 17.866667 19.41667 21.00000
## 10: Van USA 15.400000 18.26000 21.16000
## 11: Van non-USA 17.200000 20.15000 23.12500
cars.DT[,.(min.size=min(EngineSize),avg.size=mean(EngineSize),max.size=max(EngineSize)),by=.(Type,Origin)][order(Type)]
## Type Origin min.size avg.size max.size
## 1: Compact non-USA 2.0 2.311111 2.8
## 2: Compact USA 2.0 2.357143 3.0
## 3: Large USA 3.3 4.209091 5.7
## 4: Midsize non-USA 2.0 2.983333 4.5
## 5: Midsize USA 2.2 3.150000 4.6
## 6: Small non-USA 1.0 1.550000 1.8
## 7: Small USA 1.3 1.685714 2.2
## 8: Sporty USA 1.6 2.900000 5.7
## 9: Sporty non-USA 1.3 1.950000 2.8
## 10: Van USA 3.0 3.580000 4.3
## 11: Van non-USA 2.4 2.725000 3.0
cars.DT[,.(avg.wt=mean(Weight),avg.length=mean(Length),avg.width=mean(Width)),
by=.(Type,Origin)][order(Type)]
## Type Origin avg.wt avg.length avg.width
## 1: Compact non-USA 3020.556 182.0000 67.22222
## 2: Compact USA 2786.429 182.2857 67.28571
## 3: Large USA 3695.455 204.8182 74.72727
## 4: Midsize non-USA 3437.083 189.5000 69.91667
## 5: Midsize USA 3355.500 196.2000 71.50000
## 6: Small non-USA 2293.929 166.1429 64.85714
## 7: Small USA 2350.714 169.2857 66.14286
## 8: Sporty USA 3039.375 180.7500 70.62500
## 9: Sporty non-USA 2713.333 167.8333 67.50000
## 10: Van USA 3779.000 183.4000 74.00000
## 11: Van non-USA 3895.000 188.5000 72.25000
geom_violin()
MPG.city
and
MPG.highway
cars.DT[,.(min=min(MPG.city),avg=mean(MPG.city),max=max(MPG.city),sd=sd(MPG.city)),by=.(Type,Origin)][order(Type)]
## Type Origin min avg max sd
## 1: Compact non-USA 20 22.11111 26 2.2047928
## 2: Compact USA 22 23.42857 25 1.2724180
## 3: Large USA 16 18.36364 20 1.5015144
## 4: Midsize non-USA 17 19.33333 22 1.6696942
## 5: Midsize USA 16 19.80000 23 2.2010099
## 6: Small non-USA 22 30.92857 46 6.9555101
## 7: Small USA 23 27.71429 31 3.4016803
## 8: Sporty USA 17 20.62500 24 2.6692696
## 9: Sporty non-USA 17 23.33333 30 4.9665548
## 10: Van USA 15 16.60000 18 1.5165751
## 11: Van non-USA 17 17.50000 18 0.5773503
cars.DT[,.(min=min(MPG.highway),avg=mean(MPG.highway),max=max(MPG.highway),sd=sd(MPG.highway)),by=.(Type,Origin)][order(Type)]
## Type Origin min avg max sd
## 1: Compact non-USA 26 29.33333 34 2.500000
## 2: Compact USA 27 30.57143 36 3.505098
## 3: Large USA 25 26.72727 28 1.272078
## 4: Midsize non-USA 22 25.75000 30 2.416797
## 5: Midsize USA 25 27.90000 31 2.183270
## 6: Small non-USA 29 36.28571 50 6.157279
## 7: Small USA 29 33.85714 41 4.259443
## 8: Sporty USA 24 27.50000 30 2.267787
## 9: Sporty non-USA 25 30.50000 36 4.593474
## 10: Van USA 20 21.40000 23 1.516575
## 11: Van non-USA 21 22.50000 24 1.290994
t.test()
cars.xtabs <- xtabs(~Origin+Type,data=cars.DT[,.N,by=.(Type,Origin)])
cars.type <- names(which(colSums(cars.xtabs) == 2))
pVal.city <- rep(-1,length(cars.type))
pVal.high <- rep(-1,length(cars.type))
names(pVal.city) <- cars.type
for( i in 1:length(cars.type)){
text <- cars.type[i]
a <- t.test(cars.DT[Origin=="USA" & Type==text]$MPG.city,cars.DT[Origin=="non-USA" & Type==text]$MPG.city)
pVal.city[i] <- a$p.value
b <- t.test(cars.DT[Origin=="USA" & Type==text]$MPG.highway,cars.DT[Origin=="non-USA" & Type==text]$MPG.highway)
pVal.high[i] <- b$p.value
}
rbind(pVal.city,pVal.high)
## Compact Midsize Small Sporty Van
## pVal.city 0.1573010 0.58884034 0.1712123 0.2643436 0.2731866
## pVal.high 0.4464744 0.04060965 0.3065013 0.1858016 0.2788020
anova()
##-- what does this anova test mean
anova(lm(MPG.city~Type*Origin,data=cars.DT))
anova(lm(MPG.highway~Type*Origin,data=cars.DT))
cars.DT[,MPG.diff:= MPG.highway-MPG.city]
cars.DT[
,.(cv.city=sd(MPG.city)/mean(MPG.city),cv.high=sd(MPG.highway)/mean(MPG.highway),gap=mean(MPG.diff) )
, by=.(Type,Origin) ] -> mpg.DT
xtabs(cv.high~Type+Origin,data=mpg.DT)
## Origin
## Type USA non-USA
## Compact 0.11465275 0.08522727
## Large 0.04759475 0.00000000
## Midsize 0.07825339 0.09385621
## Small 0.12580634 0.16968880
## Sporty 0.08246498 0.15060569
## Van 0.07086799 0.05737753
xtabs(gap~Type+Origin,data=mpg.DT)
## Origin
## Type USA non-USA
## Compact 7.142857 7.222222
## Large 8.363636 0.000000
## Midsize 8.100000 6.416667
## Small 6.142857 5.357143
## Sporty 6.875000 7.166667
## Van 4.800000 5.000000
Origin
to gain insights.
(hint: Please use the whole dataset as training
dataset as this dataset is really bad for classification. why?
) cars.DT[,Manufacturer:=NULL]
require(party)
plot(ctree(Origin~.,data=cars.DT))
require(rpart)
require(rpart.plot)
cars.rpart <- rpart(Origin~.,data=cars.DT)
prp(cars.rpart,extra=104,type=1)
data.table
package and convert data
into appropreate format require(data.table)
cheat.DT <- as.data.table(fread("cheatEx.csv"))
head(cheat.DT)
## tID refund marital income cheat
## 1: 1 Yes Single 125 No
## 2: 2 No Married 100 No
## 3: 3 No Single 70 No
## 4: 4 Yes Married 120 No
## 5: 5 No Divorced 95 Yes
## 6: 6 No Married 60 No
Explanation
cheat.DT[,refund :=as.factor(refund)]
cheat.DT[,marital:=as.factor(marital)]
cheat.DT[,cheat :=factor(cheat,labels = c("No","Yes"))]
str(cheat.DT)
## Classes 'data.table' and 'data.frame': 10 obs. of 5 variables:
## $ tID : int 1 2 3 4 5 6 7 8 9 10
## $ refund : Factor w/ 2 levels "No","Yes": 2 1 1 2 1 1 2 1 1 1
## $ marital: Factor w/ 3 levels "Divorced","Married",..: 3 2 3 2 1 2 1 3 2 3
## $ income : int 125 100 70 120 95 60 220 85 75 90
## $ cheat : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 1 2 1 2
## - attr(*, ".internal.selfref")=<externalptr>
cheat.DT[,bas.income:= cut(income,c(50,80,230),labels=c("Low","Hig"))]
require(ggplot2)
require(dplyr)
cheat.DT[,.N,by=.(marital,cheat,refund,bas.income)] %>% ggplot(aes(x=marital,y=cheat,color=refund,size=factor(N) )) +
facet_grid(cols=vars(bas.income) ) + geom_count(alpha=0.6)
Quesion: What do you observe
cheat0.DT <- cheat.DT
dcast(cheat0.DT[,.N,by=.(cheat,refund)],cheat~refund)
## cheat No Yes
## 1: No 4 3
## 2: Yes 3 NA
dcast(cheat0.DT[,.N,by=.(cheat,marital)],cheat~marital)
## cheat Divorced Married Single
## 1: No 1 4 2
## 2: Yes 1 NA 2
dcast(cheat0.DT[,.N,by=.(cheat,bas.income)],cheat~bas.income)
## cheat Low Hig
## 1: No 3 4
## 2: Yes NA 3
cheat1.DT <- cheat0.DT[marital!="Married"]
dcast(cheat1.DT[,.N,by=.(cheat,refund)],cheat~refund)
## cheat No Yes
## 1: No 1 2
## 2: Yes 3 NA
dcast(cheat1.DT[,.N,by=.(cheat,bas.income)],cheat~bas.income)
## cheat Low Hig
## 1: No 1 2
## 2: Yes NA 3
cheat2.DT <- cheat1.DT[refund!="Yes"]
dcast(cheat2.DT[,.N,by=.(cheat,marital)],cheat~marital)
## cheat Divorced Single
## 1: No NA 1
## 2: Yes 1 2
dcast(cheat2.DT[,.N,by=.(cheat,bas.income)],cheat~bas.income)
## cheat Low Hig
## 1: No 1 NA
## 2: Yes NA 3
FSelectorRcpp
require(FSelectorRcpp)
information_gain(cheat~.,data=cheat0.DT)
## attributes importance
## 1 tID 0.0000000
## 2 refund 0.1328286
## 3 marital 0.1949760
## 4 income 0.0000000
## 5 bas.income 0.1328286
information_gain(cheat~.,data=cheat1.DT)
## attributes importance
## 1 tID 0.0000000
## 2 refund 0.3182571
## 3 marital 0.0000000
## 4 income 0.0000000
## 5 bas.income 0.1323041
information_gain(cheat~.,data=cheat2.DT)
## attributes importance
## 1 tID 0.56233514
## 2 refund 0.00000000
## 3 marital 0.08494952
## 4 income 0.56233514
## 5 bas.income 0.56233514
openxlsx
is required to import. require(data.table)
require(openxlsx)
salesYr.DT <- as.data.table(read.xlsx("pg_TiHi.xlsx", sheet=1))
oldInfo.DT <- as.data.table(read.xlsx("pg_TiHi.xlsx", sheet="cur"))
pattern.DT <- as.data.table(read.xlsx("pg_TiHi.xlsx", sheet=3))
##-- set primary key
setkey(salesYr.DT,"skuID")
setkey(oldInfo.DT,"skuID")
setkey(pattern.DT,"patternID")
NA
?NA
. salesYr.DT[,lapply(.SD,function(o){sum(is.na(o))} ) ,.SDcols=1:ncol(salesYr.DT)]
## skuID sale.Y0 sale.Y1
## 1: 0 0 0
which(is.na(oldInfo.DT),arr.ind = T)
## row col
pattern.DT[,lapply(.SD,function(o){sum(is.na(o))} ) ,.SDcols=1:ncol(pattern.DT)]
## car.dim.ID patternID #SKU car.length car.width car.height car.layer.new
## 1: 0 0 0 0 0 0 0
## nlayer.new TiHi.new
## 1: 0 0
NA
data.table
or apply duplicated()
any(duplicated(pattern.DT))
require(stringr)
##-- salesYr.DT (NO)
any(duplicated(salesYr.DT[,2:ncol(salesYr.DT),with=F]))
## [1] FALSE
##-- oldInfo.DT ()
any(duplicated(oldInfo.DT[,2:ncol(oldInfo.DT),with=F]))
## [1] TRUE
###-- Yes-- which one? how to solve
colnames(oldInfo.DT)
## [1] "skuID" "car.layer.cur" "nLayer.cur" "TiHi.cur"
## [5] "car.length" "car.width" "car.height" "gross.wt"
## [9] "net.weight" "plt.length" "plt.width"
oldInfo.DT[,.N,by=car.layer.cur:plt.width][N>1]
## car.layer.cur nLayer.cur TiHi.cur car.length car.width car.height gross.wt
## 1: 10 4 40 395 250 220 10.260
## 2: 10 4 40 395 250 220 10.660
## 3: 10 4 40 395 250 220 9.950
## 4: 12 5 60 360 230 170 4.972
## 5: 10 4 40 395 250 220 8.031
## 6: 10 4 40 395 250 220 8.119
## 7: 10 4 40 395 250 220 8.130
## 8: 10 4 40 395 250 220 8.259
## 9: 10 4 40 395 250 220 8.359
## 10: 10 4 40 395 250 220 8.390
## 11: 10 4 40 395 250 220 9.590
## 12: 12 4 48 345 245 235 7.233
## 13: 11 3 33 354 258 239 2.636
## 14: 11 3 33 354 258 239 4.353
## 15: 10 6 60 387 298 133 2.904
## 16: 16 3 48 285 218 241 9.996
## 17: 12 3 36 345 245 235 6.646
## 18: 12 3 36 345 245 235 6.655
## 19: 12 3 36 345 245 235 7.173
## 20: 12 3 36 345 245 235 7.233
## 21: 12 3 36 345 245 235 7.463
## 22: 12 3 36 345 245 235 8.307
## car.layer.cur nLayer.cur TiHi.cur car.length car.width car.height gross.wt
## net.weight plt.length plt.width N
## 1: 9.192 1200 1000 2
## 2: 9.816 1200 1000 2
## 3: 9.120 1200 1000 7
## 4: 4.355 1200 1000 4
## 5: 7.272 1200 1000 3
## 6: 7.272 1200 1000 4
## 7: 7.272 1200 1000 2
## 8: 7.500 1200 1000 4
## 9: 7.512 1200 1000 2
## 10: 7.512 1200 1000 4
## 11: 8.712 1200 1000 2
## 12: 6.535 1200 1000 2
## 13: 2.161 1200 1000 4
## 14: 3.951 1200 1000 3
## 15: 2.223 1200 1000 4
## 16: 9.504 1200 1000 3
## 17: 5.920 1200 1000 2
## 18: 5.920 1200 1000 2
## 19: 6.535 1200 1000 2
## 20: 6.535 1200 1000 9
## 21: 6.765 1200 1000 3
## 22: 7.609 1200 1000 2
## net.weight plt.length plt.width N
oldInfo.DT[,.N,by=car.layer.cur:car.height][N>1]
## car.layer.cur nLayer.cur TiHi.cur car.length car.width car.height N
## 1: 10 4 40 395 250 220 39
## 2: 12 5 60 360 230 170 4
## 3: 12 4 48 345 245 235 4
## 4: 11 3 33 354 258 239 7
## 5: 10 4 40 435 233 190 3
## 6: 10 6 60 387 298 133 4
## 7: 16 3 48 285 218 241 4
## 8: 12 3 36 345 245 235 24
## 9: 16 3 48 286 218 241 2
oldInfo.DT[,carID:=str_c(car.length,"-",car.width,"-",car.height)]
##-- pattern.DT ()
any(duplicated(pattern.DT[,-2,with=F]))
## [1] FALSE
###-- Yes-- which one?
pattern.DT[,.N,by=.(car.length,car.width,car.height)][N>1]
## car.length car.width car.height N
## 1: 345 245 235 2
## 2: 285 218 241 2
## 3: 404 244 238 2
## 4: 334 254 193 2
pattern.DT[str_sub(car.dim.ID,7,8)!=""]
## car.dim.ID patternID #SKU car.length car.width car.height car.layer.new
## 1: CD-003A CD03 24 345 245 235 13
## 2: CD-003B CD04 4 345 245 235 13
## 3: CD-004A CD05 4 285 218 241 17
## 4: CD-004B CD06 1 285 218 241 17
## 5: CD-006A CD08 1 404 244 238 10
## 6: CD-006B CD09 7 404 244 238 10
## 7: CD-008A CD11 1 334 254 193 12
## 8: CD-008B CD12 3 334 254 193 12
## nlayer.new TiHi.new
## 1: 4 52
## 2: 4 52
## 3: 4 68
## 4: 4 68
## 5: 4 40
## 6: 4 40
## 7: 5 60
## 8: 5 60
## skuID car.layer.cur nLayer.cur carID
## 1: BR01-001 10 4 395-250-220
## 2: BR01-002 10 4 395-250-220
## 3: BR01-003 10 4 395-250-220
## 4: BR01-004 10 4 395-250-220
## 5: BR01-005 10 4 395-250-220
## 6: BR01-006 10 4 395-250-220
## 7: BR01-007 10 4 395-250-220
## 8: BR01-008 10 4 395-250-220
## 9: BR01-009 10 4 395-250-220
## 10: BR01-010 10 4 395-250-220
## 11: BR01-011 10 4 395-250-220
## 12: BR01-012 12 5 360-230-170
## 13: BR01-013 12 5 360-230-170
## 14: BR01-014 12 5 360-230-170
## 15: BR01-015 12 5 360-230-170
## 16: BR01-016 10 4 395-250-220
## 17: BR01-017 10 4 395-250-220
## 18: BR01-018 10 4 395-250-220
## 19: BR01-019 10 4 395-250-220
## 20: BR01-020 10 4 395-250-220
## 21: BR01-021 10 4 395-250-220
## 22: BR01-022 10 4 395-250-220
## 23: BR01-023 10 4 395-250-220
## 24: BR01-024 10 4 395-250-220
## 25: BR01-025 10 4 395-250-220
## 26: BR01-026 10 4 395-250-220
## 27: BR01-027 10 4 395-250-220
## 28: BR01-028 10 4 395-250-220
## 29: BR01-029 10 4 395-250-220
## 30: BR01-030 10 4 395-250-220
## 31: BR01-031 10 4 395-250-220
## 32: BR01-032 10 4 395-250-220
## 33: BR01-033 10 4 395-250-220
## 34: BR01-034 10 4 395-250-220
## 35: BR01-035 10 4 395-250-220
## 36: BR01-036 10 4 395-250-220
## 37: BR01-037 10 4 395-250-220
## 38: BR01-038 10 4 395-250-220
## 39: BR01-039 10 4 395-250-220
## 40: BR01-040 10 4 395-250-220
## 41: BR01-041 10 4 395-250-220
## 42: BR01-042 10 4 395-250-220
## 43: BR01-043 10 4 395-250-220
## 44: BR01-044 12 4 345-245-235
## 45: BR01-045 12 4 345-245-235
## 46: BR01-046 12 4 345-245-235
## 47: BR01-047 12 4 345-245-235
## 48: BR01-048 16 4 285-218-241
## 49: BR01-049 11 3 354-258-239
## 50: BR01-050 11 3 354-258-239
## 51: BR01-051 11 3 354-258-239
## 52: BR01-052 11 3 354-258-239
## 53: BR01-053 11 3 354-258-239
## 54: BR01-054 11 3 354-258-239
## 55: BR01-055 11 3 354-258-239
## 56: BR01-056 10 3 404-244-238
## 57: BR01-057 10 4 435-233-190
## 58: BR01-058 12 4 334-254-193
## 59: BR01-059 10 4 435-233-190
## 60: BR01-060 10 4 435-233-190
## 61: BR01-061 10 6 387-298-133
## 62: BR01-062 10 6 387-298-133
## 63: BR01-063 10 6 387-298-133
## 64: BR01-064 10 6 387-298-133
## 65: BR01-065 16 3 285-218-241
## 66: BR01-066 16 3 285-218-241
## 67: BR01-067 16 3 285-218-241
## 68: BR01-068 12 3 345-245-235
## 69: BR01-069 12 3 345-245-235
## 70: BR01-070 12 3 345-245-235
## 71: BR01-071 12 3 345-245-235
## 72: BR01-072 12 3 345-245-235
## 73: BR01-073 12 3 345-245-235
## 74: BR01-074 12 3 345-245-235
## 75: BR01-075 12 3 345-245-235
## 76: BR01-076 12 3 345-245-235
## 77: BR01-077 12 3 345-245-235
## 78: BR01-078 12 3 345-245-235
## 79: BR01-079 12 3 345-245-235
## 80: BR01-080 12 3 345-245-235
## 81: BR01-081 12 3 345-245-235
## 82: BR01-082 12 3 345-245-235
## 83: BR01-083 12 3 345-245-235
## 84: BR01-084 12 3 345-245-235
## 85: BR01-085 12 3 345-245-235
## 86: BR01-086 12 3 345-245-235
## 87: BR01-087 12 3 345-245-235
## 88: BR01-088 12 3 345-245-235
## 89: BR01-089 12 3 345-245-235
## 90: BR01-090 12 3 345-245-235
## 91: BR01-091 12 3 345-245-235
## 92: BR01-092 16 3 285-218-241
## 93: BR01-093 16 3 286-218-241
## 94: BR01-094 16 3 286-218-241
## skuID car.layer.cur nLayer.cur carID
## skuID sale.Y0 sale.Y1
## 1: BR01-001 79189 29980
## 2: BR01-002 91138 44692
## 3: BR01-003 33676 9134
## 4: BR01-004 49014 14698
## 5: BR01-005 11091 6251
## 6: BR01-006 9573 2602
## skuID car.layer.cur nLayer.cur carID
## 1: BR01-001 10 4 395-250-220
## 2: BR01-002 10 4 395-250-220
## 3: BR01-003 10 4 395-250-220
## 4: BR01-004 10 4 395-250-220
## 5: BR01-005 10 4 395-250-220
## 6: BR01-006 10 4 395-250-220
## car.dim.ID car.length car.width car.height car.layer.new nlayer.new
## 1: CD-001 395 250 220 12 4
## 2: CD-002 360 230 170 13 5
## 3: CD-003A 345 245 235 13 4
## 4: CD-003B 345 245 235 13 4
## 5: CD-004A 285 218 241 17 4
## 6: CD-004B 285 218 241 17 4
## carID
## 1: 395-250-220
## 2: 360-230-170
## 3: 345-245-235
## 4: 345-245-235
## 5: 285-218-241
## 6: 285-218-241
pgTiHi.DT
. Here are the result pgTiHi.DT <- salesYr.DT[oldInfo.DT,on="skuID"]
pgTiHi.DT <- pgTiHi.DT[pattern.DT,on="carID",allow.cartesian=TRUE][!is.na(skuID)]
head(pgTiHi.DT)
## skuID sale.Y0 sale.Y1 car.layer.cur nLayer.cur carID car.dim.ID
## 1: BR01-001 79189 29980 10 4 395-250-220 CD-001
## 2: BR01-002 91138 44692 10 4 395-250-220 CD-001
## 3: BR01-003 33676 9134 10 4 395-250-220 CD-001
## 4: BR01-004 49014 14698 10 4 395-250-220 CD-001
## 5: BR01-005 11091 6251 10 4 395-250-220 CD-001
## 6: BR01-006 9573 2602 10 4 395-250-220 CD-001
## car.length car.width car.height car.layer.new nlayer.new
## 1: 395 250 220 12 4
## 2: 395 250 220 12 4
## 3: 395 250 220 12 4
## 4: 395 250 220 12 4
## 5: 395 250 220 12 4
## 6: 395 250 220 12 4
pgTiHi.DT[,nPLT.Y0.cur:= sale.Y0/(car.layer.cur*nLayer.cur)]
pgTiHi.DT[,nPLT.Y1.cur:= sale.Y1/(car.layer.cur*nLayer.cur)]
pgTiHi.DT[,nPLT.Y0.new:= sale.Y0/(car.layer.new*nlayer.new)]
pgTiHi.DT[,nPLT.Y1.new:= sale.Y1/(car.layer.new*nlayer.new)]
pgTiHi.DT[,nPLT.Y0.diff:= nPLT.Y0.cur-nPLT.Y0.new]
pgTiHi.DT[,nPLT.Y1.diff:= nPLT.Y1.cur-nPLT.Y1.new]
pgTiHi.DT[,lapply(.SD,sum),.SDcols=c("nPLT.Y0.diff","nPLT.Y1.diff")]
## nPLT.Y0.diff nPLT.Y1.diff
## 1: 26885.76 12185.21
pgTiHi.DT[,colnames(pgTiHi.DT) %like% ".Y",with=F]
## sale.Y0 sale.Y1 nPLT.Y0.cur nPLT.Y1.cur nPLT.Y0.new nPLT.Y1.new
## 1: 79189 29980 1979.72500 749.5000 1649.77083 624.5833
## 2: 91138 44692 2278.45000 1117.3000 1898.70833 931.0833
## 3: 33676 9134 841.90000 228.3500 701.58333 190.2917
## 4: 49014 14698 1225.35000 367.4500 1021.12500 306.2083
## 5: 11091 6251 277.27500 156.2750 231.06250 130.2292
## ---
## 125: 2592 13688 43.20000 228.1333 37.02857 195.5429
## 126: 4727 15187 78.78333 253.1167 67.52857 216.9571
## 127: 7801 15689 130.01667 261.4833 111.44286 224.1286
## 128: 31051 18725 646.89583 390.1042 456.63235 275.3676
## 129: 43251 14745 901.06250 307.1875 636.04412 216.8382
## nPLT.Y0.diff nPLT.Y1.diff
## 1: 329.954167 124.91667
## 2: 379.741667 186.21667
## 3: 140.316667 38.05833
## 4: 204.225000 61.24167
## 5: 46.212500 26.04583
## ---
## 125: 6.171429 32.59048
## 126: 11.254762 36.15952
## 127: 18.573810 37.35476
## 128: 190.263480 114.73652
## 129: 265.018382 90.34926
temp.DT <- pgTiHi.DT[,.(car.layer.diff=car.layer.new - car.layer.cur,nLayer.diff=nlayer.new-nLayer.cur),by=.(carID,skuID)]
temp.DT[,.N,by=car.layer.diff][order(car.layer.diff)]
## car.layer.diff N
## 1: 0 18
## 2: 1 72
## 3: 2 39
temp.DT[,.N,by=nLayer.diff][order(nLayer.diff)]
## nLayer.diff N
## 1: 0 53
## 2: 1 76
xtabs(~nLayer.diff+car.layer.diff,data=temp.DT)
## car.layer.diff
## nLayer.diff 0 1 2
## 0 0 14 39
## 1 18 58 0
require(ggplot2)
require(dplyr)
require(stringr)
pgTiHi.DT[,lapply(.SD,unique),.SDcols=c('car.layer.new','nlayer.new','car.layer.cur','nLayer.cur'),by=carID ] -> patTiHi.DT
setnames(patTiHi.DT,"nlayer.new","nLayer.new")
patTiHi.DT <- melt(patTiHi.DT,id.vars = "carID")
patTiHi.DT[,length:= str_sub(carID,1,3)]
patTiHi.DT[,width := str_sub(carID,5,7)]
patTiHi.DT[,len.we:= str_sub(carID,1,7)]
patTiHi.DT[,height:= str_sub(carID,9,11)]
patTiHi.DT[,area := as.factor(round(as.numeric(length)*as.numeric(height)/10000,2))]
patTiHi.DT[variable %like% "nLayer"] %>% ggplot(aes(y=value,x=height,fill=variable)) + geom_boxplot()
patTiHi.DT[variable %like% "car.layer"] %>% ggplot(aes(y=value,x=len.we,fill=variable)) + geom_boxplot()
patTiHi.DT[variable %like% "car.layer"] %>% ggplot(aes(y=value,x=area,fill=variable)) + geom_boxplot()
require(party)
pgTiHi.class <- pgTiHi.DT[,.(nlayer.new,car.layer.new,car.length,car.width,car.height)]
pgTiHi.class[,nlayer.new:= as.factor(nlayer.new)]
pgTiHi.nLayer.ctree <- ctree(nlayer.new~., data=pgTiHi.class)
plot(pgTiHi.nLayer.ctree)
pgTiHi.class[,car.layer.new:= as.factor(car.layer.new)]
pgTiHi.carLayer.ctree <- ctree(car.layer.new~car.length, data=pgTiHi.class)
plot(pgTiHi.carLayer.ctree)
Copyright 2019 Oran Kittithreerapronchai. All Rights Reserved. Last modified: 2023-45-23,