-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathChips.Rmd
More file actions
136 lines (105 loc) · 3.04 KB
/
Chips.Rmd
File metadata and controls
136 lines (105 loc) · 3.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
---
title: "chips"
output:
  word_document: default
  html_document:
    df_print: paged
---
```{r}
# Attach readxl and load the Fuentes Chips workbook into `df`.
# No `sheet` argument is given, so read_excel() uses its default sheet.
library(readxl)
df <- read_excel(path = "FuentesChips.xlsx")
```
```{r}
# Preview the dataset: display its first six rows.
head(x = df, n = 6L)
```
```{r}
# Inspect the structure of the dataset: column names, types and first values.
str(object = df)
```
```{r}
# Descriptive statistics: per-column summary of the dataset,
# followed by a dump of its structure.
summary(object = df)
str(object = df)
```
```{r}
# Data visualisations ----
# Penetration per region: one bar chart per regional column, where each bar
# is the number of rows holding a given value of that column.
library(ggplot2)

# New England
ggplot(data = df, mapping = aes(x = `New England`)) +
  geom_bar()
```
```{r}
# Mid-Atlantic: count of rows per distinct value of the column.
ggplot(data = df, mapping = aes(x = `Mid-Atlantic`)) +
  geom_bar()
```
```{r}
# Midwest: count of rows per distinct value of the column.
ggplot(data = df, mapping = aes(x = Midwest)) +
  geom_bar()
```
```{r}
# Great Plains: count of rows per distinct value of the column.
ggplot(data = df, mapping = aes(x = `Great Plains`)) +
  geom_bar()
```
```{r}
# South Atlantic: count of rows per distinct value of the column.
ggplot(data = df, mapping = aes(x = `South Atlantic`)) +
  geom_bar()
```
```{r}
# Deep South: count of rows per distinct value of the column.
ggplot(data = df, mapping = aes(x = `Deep South`)) +
  geom_bar()
```
```{r}
# Mountain: count of rows per distinct value of the column.
ggplot(data = df, mapping = aes(x = Mountain)) +
  geom_bar()
```
```{r}
# Pacific: count of rows per distinct value of the column.
ggplot(data = df, mapping = aes(x = Pacific)) +
  geom_bar()
```
```{r}
# Label encoding: replace every distinct value of a regional column with the
# integer position of its factor level (levels are taken in sorted order), so
# the columns become numeric and can be readily used by the algorithms below.
encoded_regions <- c(
  "New England", "Mid-Atlantic", "Midwest", "Great Plains",
  "Deep South", "South Atlantic", "Mountain", "Pacific"
)
df[encoded_regions] <- lapply(
  df[encoded_regions],
  function(column) as.numeric(factor(column))
)
```
```{r}
# Hypothesis under test: the proportion of stores carrying Fuente's products
# is the same across the US sales regions.

# Reshape the first eight columns from wide to long. stack() returns one row
# per cell, with the cell value in column 1 and the source column name (as a
# factor) in column 2; rename those two columns for the model formula.
df2 <- stack(df[seq_len(8)])
colnames(df2)[1:2] <- c("Penetration", "Region")

# Hard-coded proportions consumed by the Marascuilo chunk further down.
# NOTE(review): presumably the per-region penetration rates, listed in the
# order of that chunk's `categories` vector -- confirm against the data.
df4 <- c(0.3481, 0.1730, 0.4788, 0.231, 0.45, 0.65, 0.43, 0.21)

# One-way ANOVA of penetration on region; summary() prints the F test.
res.aov <- aov(Penetration ~ Region, data = df2)
summary(res.aov)
```
As the p-value is less than the significance level of 0.05, we can conclude that there are significant differences between the regions, as highlighted with `**` in the model summary.
```{r}
# Marascuilo procedure ----
# Pairwise comparison of several proportions. For every pair (i, j):
#   value          = |p_i - p_j|
#   critical.range = sqrt(qchisq(conf_level, N - 1)) *
#                    sqrt(p_i * (1 - p_i) / n + p_j * (1 - p_j) / n)
# A pair differs significantly when `value` exceeds `critical.range`.
# The result, `df3`, has one row per pair with both quantities rounded to
# three decimals and a "<category i>-<category j>" tag.

## Set the proportions of interest.
p <- df4
categories <- c(
  "New England", "Mid Atlantic", "Midwest", "Great Plains",
  "South Atlantic", "Deep South", "Mountain", "Pacific"
)
# Derive the number of groups from the data instead of hard-coding it, and
# fail loudly if the labels and proportions have drifted out of sync.
N <- length(p)
stopifnot(N >= 2, length(categories) == N)

conf_level <- 0.95
# NOTE(review): 12000 is used as the sample size of every region -- confirm
# that each region really has this many observations.
n_per_group <- 12000

## Compute critical values.
# combn() enumerates the pairs (1,2), (1,3), ..., (N-1,N): the same order as
# a nested `for i / for j > i` loop, but without growing vectors element by
# element.
pair_index <- combn(N, 2)
first <- pair_index[1, ]
second <- pair_index[2, ]

value <- abs(p[first] - p[second])
critical.range <- sqrt(qchisq(conf_level, N - 1)) *
  sqrt(
    p[first] * (1 - p[first]) / n_per_group +
      p[second] * (1 - p[second]) / n_per_group
  )
tag <- paste(categories[first], categories[second], sep = "-")

# Build the data frame directly so the numeric columns never pass through a
# character matrix (which is what cbind() with `tag` would produce).
df3 <- data.frame(
  value = round(value, 3),
  critical.range = round(critical.range, 3),
  tag = tag,
  stringsAsFactors = FALSE
)
df3
```