It is quite common to have to count the occurences of each element in a list. Python would propose the Counter
class, per example, and looking around, one would soon enough find the following snippet :
from collections import Counter
words = ['a', 'b', 'c', 'a']
Counter(words).keys() # equals to list(set(words))
Counter(words).values() # counts the elements' frequency
However, OCaml lacks the abundance of snippets to find here and there. This article will focus on counting the occurences of each element in a list, using various approaches.
A functional approach
A simple way relying on built-in List
module would be to use sort_unique
and count the element appearing in the list for each element returned by sort_unique
.
let count_unique_elements_naive list =
let count_element e list = List.filter (fun x -> x = e) list |> List.length in
List.sort_uniq Int.compare list
|> List.map (fun e -> (e, count_element e list))
The following could be put in a count.ml
file in order to test the previous snippet:
let print_int_pair_list list =
let string_of_int_pair pair = match pair with
| a,b -> (string_of_int a)^" - "^(string_of_int b)^"\n" in
List.map string_of_int_pair list
|> List.iter print_string
let count_unique_elements_naive list =
let count_element e list = List.filter (fun x -> x = e) list |> List.length in
List.sort_uniq Int.compare list
|> List.map (fun e -> (e, count_element e list))
let () =
count_unique_elements_naive [1; 1; 3; 5; 7; 7; 1]
|> print_int_pair_list
Running ocaml count.ml
would output the following:
1 - 3
3 - 1
5 - 1
7 - 2
Obviously, this method makes as many passes on the list as the number of unique elements in the list, which is a waste of time.
Using Hash tables
Here hash tables comes-in handy. Updating a counter for each element of the list, one can achieve the same results in a single pass over the data.
Note that in recent versions (>= 4.07), it is easier to turn Hashtbl
s to list or seq, using the to_seq
functions.
let count_unique_elements_hashtbl list =
let counter = Hashtbl.create 10000 in
let update_counter x =
if Hashtbl.mem counter x then
let current_count = Hashtbl.find counter x in
Hashtbl.replace counter x (succ current_count)
else
Hashtbl.replace counter x 1
in
List.iter update_counter list;
Hashtbl.to_seq counter
|> List.of_seq
Using integer hash tables
As stated in the documentation:
The functorial interface allows the use of specific comparison and hash functions, either for performance/security concerns, or because keys are not hashable/comparable with the polymorphic builtins.
Performing a minor replacement in the code above, we can rewrite this as:
module IntHash =
struct
type t = int
let equal i j = i=j
let hash i = i land max_int
end
(* Here we create a specialized module for hash tables whose key is an integer *)
module IntHashtbl = Hashtbl.Make(IntHash)
let count_unique_elements_int_hashtbl list =
let counter = IntHashtbl.create 10000 in
let update_counter x =
if IntHashtbl.mem counter x then
let current_count = IntHashtbl.find counter x in
IntHashtbl.replace counter x (succ current_count)
else
IntHashtbl.replace counter x 1
in
List.iter update_counter list;
IntHashtbl.to_seq counter
|> List.of_seq
Benchmark
And the winner is count_unique_elements_int_hashtbl
.
$ ocamlopt main.ml
$ ./a.out
count_unique_elements_naive Execution time: 0.372140s
count_unique_elements_hashtbl Execution time: 0.089627s
count_unique_elements_int_hashtbl Execution time: 0.081218s
If one wants to reproduce the results:
let print_int_pair_list list =
let string_of_int_pair pair = match pair with
| a,b -> (string_of_int a)^" - "^(string_of_int b)^"\n" in
List.map string_of_int_pair list
|> List.iter print_string
let time f x =
let t = Sys.time() in
let fx = f x in
Printf.printf "Execution time: %fs\n" (Sys.time() -. t);
fx
let count_unique_elements_naive list =
let count_element e list = List.filter (fun x -> x = e) list |> List.length in
List.sort_uniq Int.compare list
|> List.map (fun e -> (e, count_element e list))
let count_unique_elements_hashtbl list =
let counter = Hashtbl.create 10000 in
let update_counter x =
if Hashtbl.mem counter x then
let current_count = Hashtbl.find counter x in
Hashtbl.replace counter x (succ current_count)
else
Hashtbl.replace counter x 1
in
List.iter update_counter list;
Hashtbl.to_seq counter
|> List.of_seq
module IntHash =
struct
type t = int
let equal i j = i=j
let hash i = i land max_int
end
(* Using the Hashtbl functorial interface *)
module IntHashtbl = Hashtbl.Make(IntHash)
let count_unique_elements_int_hashtbl list =
let counter = IntHashtbl.create 10000 in
let update_counter x =
if IntHashtbl.mem counter x then
let current_count = IntHashtbl.find counter x in
IntHashtbl.replace counter x (succ current_count)
else
IntHashtbl.replace counter x 1
in
List.iter update_counter list;
IntHashtbl.to_seq counter
|> List.of_seq
let sample = List.init 1000000 (fun _ -> Random.int 10)
let named_methods = [("count_unique_elements_naive", count_unique_elements_naive)
;("count_unique_elements_hashtbl", count_unique_elements_hashtbl)
;("count_unique_elements_int_hashtbl", count_unique_elements_int_hashtbl)]
let () =
let _ = List.map (fun pair -> print_string (fst pair); print_string "\t";time (snd pair) sample) named_methods in
()
Books
Real World OCaml: Functional programming for the masses by Yaron Minsky, Anil Madhavapeddy and Jason Hickey is a good introduction to OCaml.
Purely Functional Data Structures by Chris Okasaki, which presents another way to look at data structures, from a functional point of view. As a prerequisite, a knowledge of common structures such as Heaps, Linked Lists is needed.