Skip to content

Commit dd6fef4

Browse files
authored
Merge pull request #65 from Arkoniak/ying_yang_algorithm
Implementation of Yinyang algorithm
2 parents 6854197 + f871201 commit dd6fef4

File tree

9 files changed

+572
-38
lines changed

9 files changed

+572
-38
lines changed

docs/src/index.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,11 +56,14 @@ git checkout experimental
5656
- [X] Implementation of [Hamerly implementation](https://www.researchgate.net/publication/220906984_Making_k-means_Even_Faster).
5757
- [X] Interface for inclusion in Alan Turing Institute's [MLJModels](https://github.com/alan-turing-institute/MLJModels.jl#who-is-this-repo-for).
5858
- [X] Full Implementation of Triangle inequality based on [Elkan - 2003 "Using the Triangle Inequality to Accelerate K-Means"](https://www.aaai.org/Papers/ICML/2003/ICML03-022.pdf).
59+
- [X] Implementation of [Yinyang K-Means: A Drop-In Replacement of the Classic K-Means
60+
with Consistent Speedup](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/ding15.pdf)
5961
- [ ] Implementation of [Geometric methods to accelerate k-means algorithm](http://cs.baylor.edu/~hamerly/papers/sdm2016_rysavy_hamerly.pdf).
6062
- [ ] Support for other distance metrics supported by [Distances.jl](https://github.com/JuliaStats/Distances.jl#supported-distances).
6163
- [ ] Native support for tabular data inputs outside of MLJModels' interface.
6264
- [ ] Refactoring and finalization of API design.
6365
- [ ] GPU support.
66+
- [ ] Distributed calculations support.
6467
- [ ] Implementation of other K-Means algorithm variants based on recent literature.
6568
- [ ] Optimization of code base.
6669
- [ ] Improved Documentation
@@ -103,6 +106,7 @@ r.converged # whether the procedure converged
103106
- [Lloyd()](https://cs.nyu.edu/~roweis/csc2515-2006/readings/lloyd57.pdf)
104107
- [Hamerly()](https://www.researchgate.net/publication/220906984_Making_k-means_Even_Faster)
105108
- [Elkan()](https://www.aaai.org/Papers/ICML/2003/ICML03-022.pdf)
109+
- [Yinyang()](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/ding15.pdf)
106110
- [Geometric()](http://cs.baylor.edu/~hamerly/papers/sdm2016_rysavy_hamerly.pdf) - (Coming soon)
107111
- [MiniBatch()](https://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf) - (Coming soon)
108112

@@ -174,8 +178,10 @@ ________________________________________________________________________________
174178

175179
- 0.1.0 Initial release.
176180
- 0.1.1 Added interface for MLJ.
177-
- 0.1.2 Added Elkan algorithm.
181+
- 0.1.2 Added `Elkan` algorithm.
178182
- 0.1.3 Faster & optimized execution.
183+
- 0.1.4 Bug fixes.
184+
- 0.1.5 Added `Yinyang` algorithm.
179185

180186
## Contributing
181187

src/ParallelKMeans.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,10 @@ include("kmeans.jl")
1212
include("lloyd.jl")
1313
include("hamerly.jl")
1414
include("elkan.jl")
15+
include("yinyang.jl")
1516
include("mlj_interface.jl")
1617

1718
export kmeans
18-
export Lloyd, Hamerly, Elkan
19+
export Lloyd, Hamerly, Elkan, Yinyang
1920

2021
end # module

src/elkan.jl

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -236,21 +236,6 @@ function chunk_update_centroids(::Elkan, containers, centroids, X, r, idx)
236236
end
237237
end
238238

239-
function collect_containers(alg::Elkan, containers, n_threads)
240-
if n_threads == 1
241-
@inbounds containers.centroids_new[end] .= containers.centroids_new[1] ./ containers.centroids_cnt[1]'
242-
else
243-
@inbounds containers.centroids_new[end] .= containers.centroids_new[1]
244-
@inbounds containers.centroids_cnt[end] .= containers.centroids_cnt[1]
245-
@inbounds for i in 2:n_threads
246-
containers.centroids_new[end] .+= containers.centroids_new[i]
247-
containers.centroids_cnt[end] .+= containers.centroids_cnt[i]
248-
end
249-
250-
@inbounds containers.centroids_new[end] .= containers.centroids_new[end] ./ containers.centroids_cnt[end]'
251-
end
252-
end
253-
254239
function calculate_centroids_movement(alg::Elkan, containers, centroids)
255240
p = containers.p
256241
centroids_new = containers.centroids_new[end]
@@ -260,7 +245,6 @@ function calculate_centroids_movement(alg::Elkan, containers, centroids)
260245
end
261246
end
262247

263-
264248
function chunk_update_bounds(alg, containers, centroids, r, idx)
265249
p = containers.p
266250
lb = containers.lb

src/hamerly.jl

Lines changed: 3 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -73,23 +73,6 @@ function kmeans!(alg::Hamerly, containers, X, k;
7373
return KmeansResult(centroids, containers.labels, Float64[], Int[], Float64[], totalcost, niters, converged)
7474
end
7575

76-
77-
function collect_containers(alg::Hamerly, containers, n_threads)
78-
if n_threads == 1
79-
@inbounds containers.centroids_new[end] .= containers.centroids_new[1] ./ containers.centroids_cnt[1]'
80-
else
81-
@inbounds containers.centroids_new[end] .= containers.centroids_new[1]
82-
@inbounds containers.centroids_cnt[end] .= containers.centroids_cnt[1]
83-
@inbounds for i in 2:n_threads
84-
containers.centroids_new[end] .+= containers.centroids_new[i]
85-
containers.centroids_cnt[end] .+= containers.centroids_cnt[i]
86-
end
87-
88-
@inbounds containers.centroids_new[end] .= containers.centroids_new[end] ./ containers.centroids_cnt[end]'
89-
end
90-
end
91-
92-
9376
function create_containers(alg::Hamerly, k, nrow, ncol, n_threads)
9477
lng = n_threads + 1
9578
centroids_new = Vector{Array{Float64,2}}(undef, lng)
@@ -108,7 +91,7 @@ function create_containers(alg::Hamerly, k, nrow, ncol, n_threads)
10891

10992
labels = zeros(Int, ncol)
11093

111-
# distance that centroid moved
94+
# distance that centroid has moved
11295
p = Vector{Float64}(undef, k)
11396

11497
# distance from the center to the closest other center
@@ -289,9 +272,9 @@ function chunk_update_bounds(alg::Hamerly, containers, r1, r2, pr1, pr2, r, idx)
289272
label = labels[i]
290273
ub[i] += 2*sqrt(abs(ub[i] * p[label])) + p[label]
291274
if r1 == label
292-
lb[i] += pr2 - 2*sqrt(abs(pr2*lb[i]))
275+
lb[i] = lb[i] <= pr2 ? 0.0 : lb[i] + pr2 - 2*sqrt(abs(pr2*lb[i]))
293276
else
294-
lb[i] += pr1 - 2*sqrt(abs(pr1*lb[i]))
277+
lb[i] = lb[i] <= pr1 ? 0.0 : lb[i] + pr1 - 2*sqrt(abs(pr1*lb[i]))
295278
end
296279
end
297280
end

src/kmeans.jl

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,3 +159,19 @@ function kmeans(alg, design_matrix, k;
159159
k_init = k_init, max_iters = max_iters, tol = tol,
160160
verbose = verbose, init = init)
161161
end
162+
163+
164+
# Merge the per-slot `centroids_new` / `centroids_cnt` buffers stored in
# `containers` and write the merged values, divided by the merged counts,
# into the last slot (`[end]`) of `containers.centroids_new`.
#
# Generic fallback for every `AbstractKMeansAlg`; the algorithm argument is
# unnamed and used for dispatch only.
#
# Arguments:
#   containers - object exposing `centroids_new` and `centroids_cnt`, each
#                indexable by slot number and by `end`
#   n_threads  - number of leading slots (1..n_threads) to merge
#
# NOTE(review): presumably slots 1..n_threads hold per-thread partial sums and
# point counts and slot `end` is the shared result buffer — confirm against
# `create_containers` and the chunk update routines.
# NOTE(review): a zero count would make the division yield NaN/Inf for
# floating-point buffers — confirm whether empty clusters are handled upstream.
function collect_containers(::AbstractKMeansAlg, containers, n_threads)
165+
if n_threads == 1
166+
# Single slot: divide slot 1 straight into the result slot.
# `centroids_cnt[end]` is not written on this path.
@inbounds containers.centroids_new[end] .= containers.centroids_new[1] ./ containers.centroids_cnt[1]'
167+
else
168+
# Seed the result slot with slot 1 ...
@inbounds containers.centroids_new[end] .= containers.centroids_new[1]
169+
@inbounds containers.centroids_cnt[end] .= containers.centroids_cnt[1]
170+
# ... then accumulate the remaining slots 2..n_threads into it.
@inbounds for i in 2:n_threads
171+
containers.centroids_new[end] .+= containers.centroids_new[i]
172+
containers.centroids_cnt[end] .+= containers.centroids_cnt[i]
173+
end
174+
175+
# Divide the accumulated values by the accumulated counts, in place.
# The `'` transposes the counts before broadcasting — presumably a
# length-k vector applied per centroid column; TODO confirm shapes.
@inbounds containers.centroids_new[end] .= containers.centroids_new[end] ./ containers.centroids_cnt[end]'
176+
end
177+
end

0 commit comments

Comments
 (0)