Install the development version of protoClassification from GitHub with:
# install.packages("pak")
pak::pak("acastroaraujo/protoClassification")Get Started
To simulate a dataset you need to create to decide a couple of things first.
- The number of dimensions.
- The marginal probabilities for each dimension.
- A correlation matrix for the dimensions.
library(protoClassification)
set.seed(1)
K <- 6 # 1st step
marginals <- rbeta(K, 2, 2) # 2nd step
rho <- rlkjcorr(1, K, eta = 1) # 3rd stepGenerate data.
set.seed(1)
sim_data <- make_binary_data(marginals, rho, obs = 1e3)
sim_data
#>
#> ── Data ──
#>
#> 1000 obs. of 6 variables:
#> $ x1: int 0 1 0 1 0 1 0 1 0 1 ...
#> $ x2: int 0 1 0 1 1 1 0 1 1 1 ...
#> $ x3: int 0 1 1 0 1 0 0 0 1 0 ...
#> $ x4: int 1 1 1 1 1 1 1 1 1 1 ...
#> $ x5: int 1 1 1 1 0 0 1 1 1 1 ...
#> $ x6: int 0 1 0 0 1 0 0 1 0 1 ...
#>
#> ── Parameters ──
#>
#> ── Marginal Probabilities:
#> x1 x2 x3 x4 x5 x6
#> 0.33 0.55 0.27 0.88 0.59 0.28
#>
#>
#> ── Correlation Matrix:
#> x1 x2 x3 x4 x5 x6
#> x1 1.00 0.22 0.29 0.15 -0.02 0.36
#> x2 0.22 1.00 -0.08 -0.21 -0.01 0.37
#> x3 0.29 -0.08 1.00 -0.75 0.34 0.08
#> x4 0.15 -0.21 -0.75 1.00 -0.20 -0.34
#> x5 -0.02 -0.01 0.34 -0.20 1.00 -0.03
#> x6 0.36 0.37 0.08 -0.34 -0.03 1.00Note. The parameters are stored as the params attribute in the output.
We can verify that the column means roughly correspond to the marginal probabilities.
colMeans(sim_data)
#> x1 x2 x3 x4 x5 x6
#> 0.326 0.554 0.274 0.899 0.588 0.275In order to verify that the data follows the correlation structure in rho you would have to calculate a “tetrachoric correlation.”
psych::tetrachoric(sim_data)$rho
#> x1 x2 x3 x4 x5 x6
#> x1 1.00000000 0.23375492 0.28118697 0.1442114 -0.01192531 0.40365755
#> x2 0.23375492 1.00000000 -0.13315127 -0.1448709 0.03404136 0.38467926
#> x3 0.28118697 -0.13315127 1.00000000 -0.7612102 0.40485490 0.07792155
#> x4 0.14421140 -0.14487091 -0.76121019 1.0000000 -0.31759978 -0.34664697
#> x5 -0.01192531 0.03404136 0.40485490 -0.3175998 1.00000000 -0.08953172
#> x6 0.40365755 0.38467926 0.07792155 -0.3466470 -0.08953172 1.00000000Additional stuff for Prototype Classification Model:
wa vector of attention weights for each kPa list of prototypes, one per category.g(gamma) sensitivity parameter.
Calculate distance and similarity for one prototype at a time:
d <- calculateDistSim(
data = sim_data,
P = rep(1, K),
w = w,
g = g
)
str(d)
#> 'data.frame': 1000 obs. of 2 variables:
#> $ distance : num 0.655 0 0.477 0.457 0.145 ...
#> $ similarity: num 0.00143 1 0.00846 0.01035 0.23423 ...Calculate distance, similarity, and probabilities for multiple prototypes at the same time:
prototypes <- list(
P1 = rep(1, K),
P2 = rep(0, K),
P3 = rep(1:0, K / 2)
)
g <- rep(10, 3)
out <- compute(sim_data, prototypes, w, g)
out
#>
#> ── Output ──
#>
#> $ distance 1000 obs. of 3 variables
#> $ similarity 1000 obs. of 3 variables
#> $ probabilities 1000 obs. of 3 variables
#> $ data 1000 obs. of 6 variables
#>
#> ── Prototypes ──
#>
#> $ P1: num [1:6] 1 1 1 1 1 1
#> $ P2: num [1:6] 0 0 0 0 0 0
#> $ P3: int [1:6] 1 0 1 0 1 0
#>
#> ── Distance ──
#>
#> Manhattan (r = 1)
#>
#> ── Sensitivity ──
#>
#> g1 g2 g3
#> 10 10 10
#>
#> ── Attention Weights ──
#>
#> w1 w2 w3 w4 w5 w6
#> 0.082 0.116 0.178 0.282 0.063 0.279
#>
#> ── Marginal Probabilities ──
#>
#> ── `colMeans(.$data)`
#> x1 x2 x3 x4 x5 x6
#> 0.326 0.554 0.274 0.899 0.588 0.275
#>
#> ── `colMeans(.$probabilities)`
#> C1 C2 C3
#> 0.3883449 0.4360664 0.1755887consolidate() the previous output into a single data frame for easier visualization.
d <- consolidate(out)
str(d)
#> 'data.frame': 1000 obs. of 15 variables:
#> $ prob1: num 0.0379 0.9988 0.212 0.5783 0.998 ...
#> $ prob2: num 8.45e-01 4.53e-05 1.34e-01 2.45e-01 8.26e-04 ...
#> $ prob3: num 0.11692 0.00115 0.65354 0.17654 0.00115 ...
#> $ sim1 : num 0.00143 1 0.00846 0.01035 0.23423 ...
#> $ sim2 : num 3.18e-02 4.54e-05 5.36e-03 4.39e-03 1.94e-04 ...
#> $ sim3 : num 0.0044 0.001149 0.026083 0.003159 0.000269 ...
#> $ dist1: num 0.655 0 0.477 0.457 0.145 ...
#> $ dist2: num 0.345 1 0.523 0.543 0.855 ...
#> $ dist3: num 0.543 0.677 0.365 0.576 0.822 ...
#> $ x1 : int 0 1 0 1 0 1 0 1 0 1 ...
#> $ x2 : int 0 1 0 1 1 1 0 1 1 1 ...
#> $ x3 : int 0 1 1 0 1 0 0 0 1 0 ...
#> $ x4 : int 1 1 1 1 1 1 1 1 1 1 ...
#> $ x5 : int 1 1 1 1 0 0 1 1 1 1 ...
#> $ x6 : int 0 1 0 0 1 0 0 1 0 1 ...Note. Since only binary data is implemented, there is no difference between Manhattan and Euclidean distance!
Marginal and Conditional Probabilities
So far, a single simulation requires the marginal probabilities for each element of to be specified at the outset.
colMeans(out$data) # cf. `marginals` argument in `make_binary_data()`
#> x1 x2 x3 x4 x5 x6
#> 0.326 0.554 0.274 0.899 0.588 0.275The more relevant piece of information we get from the compute() function is the .$probabilities object, which calculates the probability that any given individual in our simulated dataset will belong to each of the prototype categories.
This allows us to calculate the marginal probabilities for each category.
colMeans(out$probabilities)
#> C1 C2 C3
#> 0.3883449 0.4360664 0.1755887With some ingenuity, we can use this information to get conditional probabilities too.
conditionalProbs(out, "features")
#> x1 x2 x3 x4 x5 x6
#> C1 0.5226571 0.7802353 0.32826487 0.9332301 0.6171392 0.62588499
#> C2 0.1302299 0.4297247 0.05791891 0.9465871 0.4820181 0.03806980
#> C3 0.3770770 0.3621551 0.69020964 0.7052639 0.7862111 0.08786438
1 - conditionalProbs(out, "features")
#> x1 x2 x3 x4 x5 x6
#> C1 0.4769570 0.2200126 0.6714007 0.06717440 0.3829206 0.3734904
#> C2 0.8703417 0.5706125 0.9425701 0.05316343 0.5189671 0.9620408
#> C3 0.6227137 0.6362298 0.3103042 0.29413893 0.2117976 0.9130266
conditionalProbs(out, type = "categories")
#> $`Xk=0`
#> C1 C2 C3
#> x1 0.2743383 0.5632908 0.16237092
#> x2 0.1926099 0.5577848 0.24960538
#> x3 0.3591157 0.5664215 0.07446281
#> x4 0.2566337 0.2286733 0.51469307
#> x5 0.3608689 0.5489563 0.09017476
#> x6 0.2001655 0.5788414 0.22099310
#>
#> $`Xk=1`
#> C1 C2 C3
#> x1 0.6236135 0.17344785 0.20293865
#> x2 0.5456643 0.33832130 0.11601444
#> x3 0.4652701 0.09116788 0.44356204
#> x4 0.4029833 0.45951724 0.13749944
#> x5 0.4073537 0.35719728 0.23544898
#> x6 0.8839345 0.06015273 0.05591273Alternatively, it’s easier to use the summary() function to extract all conditional and marginal probabilities.
probs <- summary(out)
probs
#>
#> ── Categories ──
#>
#> ── Marginals:
#> C1 C2 C3
#> 0.388 0.436 0.176
#>
#> ── Conditionals:
#> $`Xk=0`
#> C1 C2 C3
#> x1 0.276 0.562 0.162
#> x2 0.193 0.558 0.249
#> x3 0.360 0.565 0.075
#> x4 0.258 0.231 0.511
#> x5 0.362 0.548 0.091
#> x6 0.202 0.578 0.220
#>
#> $`Xk=1`
#> C1 C2 C3
#> x1 0.624 0.173 0.202
#> x2 0.548 0.337 0.116
#> x3 0.467 0.092 0.441
#> x4 0.404 0.458 0.137
#> x5 0.409 0.357 0.234
#> x6 0.884 0.060 0.056
#>
#> ── Features ──
#>
#> ── Marginals:
#> x1 x2 x3 x4 x5 x6
#> 0.326 0.554 0.274 0.899 0.588 0.275
#>
#> ── Conditionals:
#> x1 x2 x3 x4 x5 x6
#> C1 0.523 0.779 0.329 0.933 0.618 0.625
#> C2 0.129 0.429 0.057 0.946 0.480 0.038
#> C3 0.378 0.364 0.691 0.705 0.788 0.088