A probability is a number that describes the “magnitude of chance” associated with making a particular observation or statement.
It’s always a number between 0 and 1 (inclusive) and is often expressed as a fraction.
# Sum of two dice: outcomes 2..12 with probabilities 1/36, ..., 6/36, ..., 1/36.
X.outcomes <- 2:12
X.prob <- c(1:6, 5:1) / 36
barplot(
  X.prob,
  names.arg = X.outcomes, space = 0, ylim = c(0, 0.20),
  xlab = "x", ylab = "Pr(X = x)", main = "probability distribution"
)

# Cumulative version: running total of the probabilities above.
X.outcomes <- 2:12
X.prob <- c(1:6, 5:1) / 36
X.cumul <- cumsum(X.prob)
barplot(
  X.cumul,
  names.arg = X.outcomes, space = 0,
  xlab = "x", ylab = "Pr(X <= x)", main = "cumulative probability distribution"
)

# Probability plot again, with vertical lines added at x = 0.5, 1.5, ..., 10.5.
X.outcomes <- 2:12
X.prob <- c(1:6, 5:1) / 36
barplot(
  X.prob,
  names.arg = X.outcomes, space = 0, ylim = c(0, 0.20),
  xlab = "x", ylab = "Pr(X = x)", main = "probability distribution"
)
abline(v = seq(0.5, 10.5, by = 1))
lower < 7 < upper
X >= 2 & X <= 7
(X[lower] - 1)/36
X > 7 & X <= 12
(13 - X[upper])/36
# Piecewise-linear density on the grid 1..13: (x - 1)/36 for 2 <= x <= 7,
# (13 - x)/36 for 7 < x <= 12, and 0 at the two end points.
X.outcomes <- as.numeric(1:13)
lower <- X.outcomes >= 2 & X.outcomes <= 7
upper <- X.outcomes > 7 & X.outcomes <= 12
fx <- ifelse(
  lower,
  (X.outcomes - 1) / 36,
  ifelse(upper, (13 - X.outcomes) / 36, 0)
)
plot(X.outcomes, fx, type = "l", ylab = "f(x)", xlim = c(0, 14), main = "probability density function")
abline(h = 0, lty = 2, col = "gray")

# Height of the density at x = 4.5 and the area of the triangle with corners
# (1, 0), (4.5, 0) and (4.5, f(4.5)): half of base 3.5 times that height.
fx.specific <- (4.5 - 1) / 36
fx.specific.area <- (3.5 * fx.specific) / 2
fx.specific.vertices <- matrix(
  c(1,   0,
    4.5, 0,
    4.5, fx.specific),
  ncol = 2, byrow = TRUE
)

# Redraw the density, shade the triangle, mark x = 4.5 and print the area.
plot(X.outcomes, fx, type = "l", ylab = "f(x)", xlim = c(0, 14), main = "probability density function")
abline(h = 0, lty = 2, col = "gray")
polygon(fx.specific.vertices, border = NA, col = "gray")
abline(v = 4.5, lty = 3)
text(4, 0.01, labels = fx.specific.area)
Symmetry: Draw a vertical line down the center; a symmetric distribution is mirrored on either side of that line, with probability 0.5 on each side.
Skew : If a distribution is asymmetric, look at the “tail” of a distribution. Positive or right skew indicates a tail extending longer to the right of center.
Modality : Modality describes the number of easily identifiable peaks in the distribution of interest. Unimodal, bimodal, and trimodal…
# Read the semicolon-separated station file from the web, taking the column
# names from its first line. TRUE is spelled out because T is an ordinary
# variable that can be reassigned.
station_data <- read.table(
  "https://web.itu.edu.tr/~tokerem/18397_Cekmekoy_Omerli_15dk.txt",
  sep = ";",
  header = TRUE
)
# Count how often each value of the temp column occurs.
table(station_data$temp)
##
## 19.2 19.5 20.1 20.4 20.5 20.6 20.7 20.8 20.9 21 21.2 21.4 21.6 21.7 21.9
## 1 1 1 4 6 4 2 6 2 1 2 3 1 2 1
## 22.1 22.2 22.3 22.4 22.5 22.6 22.7 22.8 23 23.1 23.2 23.6 23.8 23.9 24.2
## 1 5 3 3 8 7 1 1 2 1 3 1 2 2 2
## 25.1 25.4 25.5 25.6 25.8 26.1 26.2 26.6 26.9 27.1 27.4 27.6 27.8 28 28.4
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2
## 28.5 28.8 29 29.2 29.3 29.4 29.5 29.6 30 30.1 30.2 30.4 30.8 30.9 31
## 1 1 2 2 2 1 1 1 1 2 1 4 1 3 1
## 31.2 31.5
## 1 1
# Turn the frequency table of temperatures into a data frame and print it; the
# output below shows one row per distinct value, in columns Var1 and Freq.
df_temp_table <- data.frame(table(station_data$temp))
df_temp_table
## Var1 Freq
## 1 19.2 1
## 2 19.5 1
## 3 20.1 1
## 4 20.4 4
## 5 20.5 6
## 6 20.6 4
## 7 20.7 2
## 8 20.8 6
## 9 20.9 2
## 10 21 1
## 11 21.2 2
## 12 21.4 3
## 13 21.6 1
## 14 21.7 2
## 15 21.9 1
## 16 22.1 1
## 17 22.2 5
## 18 22.3 3
## 19 22.4 3
## 20 22.5 8
## 21 22.6 7
## 22 22.7 1
## 23 22.8 1
## 24 23 2
## 25 23.1 1
## 26 23.2 3
## 27 23.6 1
## 28 23.8 2
## 29 23.9 2
## 30 24.2 2
## 31 25.1 1
## 32 25.4 1
## 33 25.5 1
## 34 25.6 1
## 35 25.8 1
## 36 26.1 1
## 37 26.2 1
## 38 26.6 1
## 39 26.9 1
## 40 27.1 1
## 41 27.4 1
## 42 27.6 1
## 43 27.8 1
## 44 28 1
## 45 28.4 2
## 46 28.5 1
## 47 28.8 1
## 48 29 2
## 49 29.2 2
## 50 29.3 2
## 51 29.4 1
## 52 29.5 1
## 53 29.6 1
## 54 30 1
## 55 30.1 2
## 56 30.2 1
## 57 30.4 4
## 58 30.8 1
## 59 30.9 3
## 60 31 1
## 61 31.2 1
## 62 31.5 1
# Relative frequency of each temperature value. Dividing by the total count
# rather than a fixed number keeps the bar heights summing to 1 for any data.
barplot(df_temp_table$Freq / sum(df_temp_table$Freq), names.arg = df_temp_table$Var1)
For discrete random variables
# Bernoulli probability p^x * (1 - p)^(1 - x), evaluated at x = 1 with p = 0.6.
p <- 0.6
x <- 1
b_fx <- (p^x) * (1 - p)^(1 - x)
# Bars for the two outcomes: 0 with probability 1 - p, 1 with probability p.
barplot(c(1 - p, p), names.arg = 0:1)
For discrete random variables
There are four functions associated with Binomial distributions.
It is a density or distribution function.
# dbinom() evaluated with 8 trials and success probability 1/2,
# first at x = 1, then at x = 4, then at every x from 0 to 8.
size <- 8
prob <- 1/2
x <- 1
dbinom(x = x, size = size, prob = prob)
## [1] 0.03125
x <- 4
dbinom(x = x, size = size, prob = prob)
## [1] 0.2734375
x <- 0:8
dbinom(x = x, size = size, prob = prob)
## [1] 0.00390625 0.03125000 0.10937500 0.21875000 0.27343750 0.21875000
## [7] 0.10937500 0.03125000 0.00390625
# Keep the nine probabilities and plot them against 0..8.
bin <- dbinom(0:8, 8, 0.5)
plot(0:8, bin)
# Two-dice probabilities padded with zero-probability outcomes 1 and 13.
X.outcomes <- 1:13
X.prob <- c(0:6, 5:0) / 36
barplot(
  X.prob,
  names.arg = X.outcomes, space = 0, ylim = c(0, 0.20),
  xlab = "x", ylab = "Pr(X = x)", main = "probability distribution"
)

# Same bar plot, with dbinom(0:12, size = 36, prob = 1/6) overlaid in red.
X.outcomes <- 1:13
X.prob <- c(0:6, 5:0) / 36
barplot(
  X.prob,
  names.arg = X.outcomes, space = 0, ylim = c(0, 0.20),
  xlab = "x", ylab = "Pr(X = x)", main = "probability distribution"
)
lines(dbinom(x = 0:12, size = 36, prob = 1/6), col = "red")
λp should be interpreted as the “mean number of occurrences”
There are four functions associated with the Poisson distribution: dpois, ppois, qpois, and rpois.
# Poisson probabilities for counts 0..10 at three rates, overlaid on one plot.
# Only y values are supplied, so the points are drawn against their index.
plot(dpois(0:10, 2.22), col = "red", type = "o")
invisible(Map(
  function(rate, shade) lines(dpois(0:10, rate), type = "o", col = shade),
  c(4.22, 7.22),
  c("blue", "green")
))
R - Common Probability Density Functions
- Uniform
- Normal
- Student’s t-distribution
- Exponential
- (gamma, beta, log-normal)
The uniform distribution is a simple density function that describes a continuous random variable whose interval of possible values offers no fluctuations in probability.
# Draw 10 values with runif() between the bounds -0.4 and 1.1 and print them.
# No seed is set in the visible code, so the numbers differ from run to run.
runif(n = 10,-0.4,1.1)
## [1] 0.9105006 0.9006214 -0.2830141 0.3968495 0.9319934 0.2151849
## [7] 0.5315867 0.8936403 0.7476632 0.6157447
# Draw a fresh sample of 10, keep it in r1 and tabulate it; in the output shown
# below every value occurs exactly once.
r1 <- runif(n = 10,-0.4,1.1)
table(r1)
## r1
## -0.398188538011163 -0.365233318158425 -0.154235270083882
## 1 1 1
## -0.144507477036677 -0.0904597032116726 0.159710376197472
## 1 1 1
## 0.319758883002214 0.473805779847316 0.615361024765298
## 1 1 1
## 1.03159253790509
## 1
# Bar plot of the tabulated sample r1: one bar per distinct value, height = count.
t1 <- table(r1)
barplot(t1)
# The same kind of plot for fresh samples of 100 and of 1000 draws.
barplot(table(runif(n = 100,-0.4,1.1)))
barplot(table(runif(n = 1000,-0.4,1.1)))
# Uniform density with min = -0.4 and max = 1.1 at six points; per the output
# below it is 0 at -2 and 1.2 and 0.6666667 at the four points in between.
dunif(x=c(-2,-0.33,0,0.5,1.05,1.2),min=-0.4,max=1.1)
## [1] 0.0000000 0.6666667 0.6666667 0.6666667 0.6666667 0.0000000
# Store the same six densities and plot them, labelling each bar with its x value.
d1 <- dunif(x=c(-2,-0.33,0,0.5,1.05,1.2),min=-0.4,max=1.1)
barplot(d1,names.arg=c(-2,-0.33,0,0.5,1.05,1.2))
# Densities at -2, then at 998 random draws between -0.4 and 1.1, then at 1.2.
d2 <- dunif(x=c(-2,runif(998,-0.4,1.1),1.2),min=-0.4,max=1.1)
barplot(d2)
Characterized by a distinctive “bell-shaped” curve, it’s also referred to as the Gaussian distribution.
About 0.95 of the probability lies between −2σ and +2σ, and about 0.99 (more precisely 0.997) between −3σ and +3σ.
# Draw 50 values with rnorm() using mean 0 and standard deviation 1, store them
# in r1 and print them.
r1 <- rnorm(50,mean = 0, sd = 1)
r1
## [1] 0.63191378 1.10297821 0.27035605 0.43025990 0.38154373
## [6] 1.02321963 2.05586228 -2.75031632 -1.72808041 0.06489684
## [11] 1.58526306 -1.44759829 0.83461985 -0.18791933 -0.91344045
## [16] -0.73132253 0.74783234 -0.93842306 -0.29665193 0.68230962
## [21] 0.40157539 0.27835772 0.14834538 0.43097822 0.01371666
## [26] 2.08231240 -1.10497277 -0.29974305 0.51615597 0.15184775
## [31] -0.20615546 1.91093318 1.85923126 0.31599771 -0.23181942
## [36] 1.66971293 -0.84868078 0.58042305 -0.04699391 0.16519299
## [41] -1.54183815 -0.38155744 -1.32150222 -0.37041601 0.44044496
## [46] 1.13541998 -0.05831687 -0.30426056 1.34203867 0.46170168
# Several views of the same sample r1.
# Values plotted against their position in the vector:
plot(r1)
# Histogram of the values:
hist(r1)
# One bar per observation, with the observation itself as the bar height:
barplot(r1)
# One bar per distinct value, with its count as the bar height:
barplot(table(r1))
# Normal density (mean 0, sd 1) evaluated at each sampled value:
dnorm(r1, mean = 0, sd = 1)
## [1] 0.326738197 0.217138692 0.384625660 0.363672943 0.370935775
## [6] 0.236353258 0.048208333 0.009085655 0.089629615 0.398103072
## [11] 0.113554996 0.139916574 0.281609567 0.391960045 0.262862221
## [16] 0.305332219 0.301626694 0.256851430 0.381768942 0.316095233
## [21] 0.368037689 0.383782212 0.394576710 0.363560468 0.398904752
## [26] 0.045640902 0.216661090 0.381417203 0.349187229 0.394369338
## [31] 0.390554182 0.064262994 0.070841593 0.379513235 0.388365390
## [36] 0.098972903 0.278296532 0.337097190 0.398502006 0.393535935
## [41] 0.121532815 0.370933835 0.166606157 0.372490947 0.362063955
## [46] 0.209396059 0.398264484 0.380897192 0.162111255 0.358608952
# Standard normal density at each sampled value, plotted against the sample.
fx3 <- dnorm(r1, mean = 0, sd = 1)
plot(r1, fx3)
# Ten observations and their five-number summary plus mean.
foo <- c(0.6, -0.6, 0.1, -0.2, -1.0, 0.4, 0.3, -1.8, 1.1, 6.0)
summary(foo)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.80 -0.50 0.20 0.49 0.55 6.00
# Show the observations as points at height 0 (no y axis, no box), with a
# dashed gray reference line through them.
plot(
  foo, rep(0, 10),
  ylab = "", yaxt = "n", bty = "n",
  cex = 2, cex.axis = 1.5, cex.lab = 1.5
)
abline(h = 0, lty = 2, col = "gray")
# Same picture again, now with an arrow and a label next to the rightmost point.
plot(
  foo, rep(0, 10),
  ylab = "", yaxt = "n", bty = "n",
  cex = 2, cex.axis = 1.5, cex.lab = 1.5
)
abline(h = 0, lty = 2, col = "gray")
arrows(x0 = 5, y0 = 0.5, x1 = 5.9, y1 = 0.1, lwd = 2)
text(x = 5, y = 0.7, labels = "outlier?", cex = 3)
Box-and-whisker plot with boxplot():
# Box plot of foo.
boxplot(foo)
# Mean of all values in foo.
mean(foo)
## [1] 0.49
# Mean after dropping the 10th element; per the output it falls from 0.49 to
# about -0.12.
mean(foo[-10])
## [1] -0.1222222
# A second sample vector.
baz <- c(-0.3,0.9,2.8,2.3,1.2,12,-4.1,-0.4,4.1,-2.3)
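A value $x$ is flagged as an outlier when
$$x < \bar{x} - 3 \cdot \mathrm{IQR} \quad \text{or} \quad x > \bar{x} + 3 \cdot \mathrm{IQR},$$
where $\bar{x}$ is the sample mean and $\mathrm{IQR}$ is the interquartile range.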
baz <- c(-0.3,0.9,2.8,2.3,1.2,12,-4.1,-0.4,4.1,-2.3,1.2)
# Empty skeleton of statistic_function: it accepts baz and has no body yet.
statistic_function <- function(baz) {
}
# Print the mean, median and range of baz, each followed by its label.
# The value of the last print() call is returned invisibly.
statistic_function <- function(baz) {
  # Combining numbers with a label coerces everything to character.
  report <- function(value, label) print(c(value, label))

  report(mean(baz), "mean")
  report(median(baz), "median")
  report(range(baz), "range")
}
statistic_function(baz)
## [1] "1.58181818181818" "mean"
## [1] "1.2" "median"
## [1] "-4.1" "12" "range"
# Print the mean, median, range, variance and standard deviation of baz, each
# followed by its label. The value of the last print() call is returned invisibly.
statistic_function <- function(baz) {
  # Combining numbers with a label coerces everything to character.
  report <- function(value, label) print(c(value, label))

  report(mean(baz), "mean")
  report(median(baz), "median")
  report(range(baz), "range")
  report(var(baz), "var")
  report(sd(baz), "sd")
}
statistic_function(baz)
## [1] "1.58181818181818" "mean"
## [1] "1.2" "median"
## [1] "-4.1" "12" "range"
## [1] "17.2456363636364" "var"
## [1] "4.15278657814682" "sd"
# Print five summary statistics of baz (each followed by its label), then draw
# an index plot and a histogram. The value of hist() is what the function returns.
statistic_function <- function(baz) {
  # Combining numbers with a label coerces everything to character.
  report <- function(value, label) print(c(value, label))

  report(mean(baz), "mean")
  report(median(baz), "median")
  report(range(baz), "range")
  report(var(baz), "var")
  report(sd(baz), "sd")

  # Graphics: values against their index, then their histogram.
  plot(baz)
  hist(baz)
}
statistic_function(baz)
## [1] "1.58181818181818" "mean"
## [1] "1.2" "median"
## [1] "-4.1" "12" "range"
## [1] "17.2456363636364" "var"
## [1] "4.15278657814682" "sd"
# Print summary statistics of baz (each followed by its label) and draw an
# index plot, histogram, bar plot of value counts and box plot.
# The value of boxplot() is what the function returns.
statistic_function <- function(baz) {
  # Combining numbers with a label coerces everything to character.
  report <- function(value, label) print(c(value, label))

  report(mean(baz), "mean")
  report(median(baz), "median")
  report(range(baz), "range")
  report(var(baz), "var")
  report(sd(baz), "sd")

  # Graphics, with the labelled summary() printed before the final box plot.
  plot(baz)
  hist(baz)
  barplot(table(baz))
  report(summary(baz), "summary")
  boxplot(baz)
}
statistic_function(baz)
## [1] "1.58181818181818" "mean"
## [1] "1.2" "median"
## [1] "-4.1" "12" "range"
## [1] "17.2456363636364" "var"
## [1] "4.15278657814682" "sd"
## Min. 1st Qu. Median
## "-4.1" "-0.35" "1.2"
## Mean 3rd Qu. Max.
## "1.58181818181818" "2.55" "12"
##
## "summary"
# Print summary statistics of a numeric vector, draw four diagnostic plots and
# report outliers.
#
# Args:
#   baz: numeric vector of observations.
#
# Output: prints mean, median, range, variance, standard deviation and
#   summary(), each as a character vector with its label appended; draws an
#   index plot, histogram, bar plot of value counts and box plot. For every
#   element below mean - 3 * IQR or above mean + 3 * IQR it then prints a
#   notice, the value, and the position of that element in baz.
#
# Returns: NULL, invisibly.
statistic_function <- function(baz) {
  print(c(mean(baz), "mean"))
  print(c(median(baz), "median"))
  print(c(range(baz), "range"))
  print(c(var(baz), "var"))
  print(c(sd(baz), "sd"))
  plot(baz)
  hist(baz)
  barplot(table(baz))
  print(c(summary(baz), "summary"))
  boxplot(baz)

  # The cut-offs do not depend on the element being tested, so compute them once.
  center <- mean(baz)
  reach <- 3 * IQR(baz)

  # which() yields each flagged position exactly once (and none for an empty
  # vector), so a duplicated outlier value is reported under its own index.
  outlier_idx <- which(baz < center - reach | baz > center + reach)
  for (i in outlier_idx) {
    print("there is outliers")
    print(c(baz[i], "outlier"))
    print(i)
  }
  invisible(NULL)
}
statistic_function(baz)
## [1] "1.58181818181818" "mean"
## [1] "1.2" "median"
## [1] "-4.1" "12" "range"
## [1] "17.2456363636364" "var"
## [1] "4.15278657814682" "sd"
## Min. 1st Qu. Median
## "-4.1" "-0.35" "1.2"
## Mean 3rd Qu. Max.
## "1.58181818181818" "2.55" "12"
##
## "summary"
## [1] "there is outliers"
## [1] "12" "outlier"
## [1] 6