Merge pull request #58 from rleonid/mv_naive_bayes
Multinomial version of Naive Bayes
rleonid committed Jul 16, 2015
2 parents 8342533 + e3a576f commit 65ef8d3
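In outline, the multinomial classifier added below keeps, for each class, a prior together with one categorical distribution per feature (feature i ranging over feature_sizes.(i) possible values), and scores a query as P(class) * prod_i P(feature_i = v_i | class), normalised over all classes. The additive-smoothing option, now a plain float factor, is shared between the existing estimate and the new estimate_mv.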
Showing 2 changed files with 110 additions and 37 deletions.
111 changes: 89 additions & 22 deletions src/lib/classify.ml
@@ -58,10 +58,13 @@ let eval ?(bernoulli=false) nb b =

let within a b x = max a (min x b)

type smoothing =
  { factor : float
  ; feature_space_size : int array
  }
let smoothing_to_prob = function
  | None ->
      (fun count bkgrnd _ -> count /. bkgrnd)
  | Some sf ->
      let sf = within 0.0 1.0 sf in
      (fun count bkgrnd space_size ->
        (count +. sf) /. (bkgrnd +. sf *. space_size))
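A worked instance of the smoothed estimator (illustrative only, not part of the change): with a factor of 1.0 (Laplace smoothing), a count of 0 out of 10 observations over a feature that can take 2 values becomes (0. +. 1.) /. (10. +. 1. *. 2.), about 0.083 instead of 0.

  let _laplace_example = smoothing_to_prob (Some 1.0) 0.0 10.0 2.0
  (* (0. +. 1.) /. (10. +. 1. *. 2.) = 1. /. 12. ≈ 0.0833 *)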

let estimate ?smoothing ?(classes=[]) ~feature_size to_ftr_arr data =
if data = [] then
@@ -96,29 +99,16 @@ let estimate ?smoothing ?(classes=[]) ~feature_size to_ftr_arr data =
    in
    let totalf = float total in
    let cls_sz = float (List.length all) in
    let to_prior_prob, to_lkhd_prob =
      match smoothing with
      | None ->
          (fun count bkgrnd _ -> count /. bkgrnd),
          (fun count bkgrnd _ -> count /. bkgrnd)
      | Some s ->
          (* TODO: Issue warning? Fail? *)
          let sf = within 0.0 1.0 s.factor in
          let fss = Array.map float s.feature_space_size in
          (fun count bkgrnd space_size ->
            (count +. sf) /. (bkgrnd +. sf *. space_size)),
          (fun count bkgrnd idx ->
            (count +. sf) /. (bkgrnd +. sf *. fss.(idx)))
    in
    let to_prob = smoothing_to_prob smoothing in
    let table =
      List.map all ~f:(fun (cl, attr_count) ->
        let prior_count = float attr_count.(feature_size) in
        let likelihood =
          Array.init aa (fun i ->
            if i = feature_size then (* Store the prior at the end. *)
              to_prior_prob prior_count totalf cls_sz
              to_prob prior_count totalf cls_sz
            else
              to_lkhd_prob (float attr_count.(i)) prior_count i)
              to_prob (float attr_count.(i)) prior_count 2.0) (* Binary. *)
        in
        cl, likelihood)
    in
@@ -127,6 +117,83 @@ let estimate ?smoothing ?(classes=[]) ~feature_size to_ftr_arr data =
; features = feature_size
}

type ('cls, 'ftr) naive_bayes_mv =
  { table : ('cls * (float * float array array)) list
  ; to_feature_array : 'ftr -> int array
  ; feature_sizes : int array
  }

let class_probabilities_mv mvnb cls =
  let (prior, likelihood_arr) = List.assoc cls mvnb.table in
  (fun ftr ->
    prior,
    Array.map2 (fun i lk_arr -> lk_arr.(i)) (mvnb.to_feature_array ftr) likelihood_arr)

let eval_mv mvnb feature =
  let evidence = ref 0.0 in
  let indices = mvnb.to_feature_array feature in
  let to_likelihood arr = prod_arr2 (fun i lk_arr -> lk_arr.(i)) indices arr in
  let byc =
    List.map mvnb.table ~f:(fun (c, (prior, class_probs)) ->
      let likelihood = to_likelihood class_probs in
      let prob = prior *. likelihood in
      evidence := !evidence +. prob;
      (c, prob))
  in
  List.map byc ~f:(fun (c, prob) -> (c, prob /. !evidence))

let assoc_opt ~default f lst =
  try
    let g = List.assoc f lst in
    let r = List.remove_assoc f lst in
    g, r
  with Not_found ->
    default (), lst

let estimate_mv ?smoothing ?(classes=[]) ~feature_sizes to_ftr_arr data =
  if data = [] then
    invalidArg "Classify.estimate_mv: Nothing to train on"
  else
    let update arr feature =
      let ftr_arr = to_ftr_arr feature in
      Array.iteri (fun i j -> arr.(i).(j) <- arr.(i).(j) + 1) ftr_arr
    in
    let new_arr () = Array.map (fun i -> Array.make i 0) feature_sizes in
    let init_lst, default =
      match classes with
      | [] -> [], (fun () -> 0, new_arr ())
      | cl -> List.map (fun c -> c, (0, new_arr ())) cl,
              fun () -> invalidArg "Classify.estimate_mv classes have been specified."
    in
    let (total, all) =
      List.fold_left data
        ~f:(fun (total, asc) (label, feature) ->
          let (p, fr), n_asc = assoc_opt ~default label asc in
          update fr feature;
          total + 1, ((label, (p + 1, fr)) :: n_asc))
        ~init:(0, init_lst)
    in
    let to_prob = smoothing_to_prob smoothing in
    let totalf = float total in
    let numcls = float (List.length all) in
    let table =
      List.map all ~f:(fun (cl, (class_count, attr_count)) ->
        let prior = to_prob (float class_count) totalf numcls in
        let likelihood =
          Array.map (fun arr ->
            let farr = Array.map float arr in
            let lsum = Array.sumf farr in
            let fssf = float (Array.length arr) in
            Array.map (fun c -> to_prob c lsum fssf) farr)
            attr_count
        in
        cl, (prior, likelihood))
    in
    { table
    ; to_feature_array = to_ftr_arr
    ; feature_sizes
    }
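A hypothetical usage sketch for the new functions (illustrative names and data, not part of the commit): two categorical features, the first taking 2 values and the second 3, with the identity mapping as to_feature_array and Laplace smoothing.

  let toy_mv =
    estimate_mv ~smoothing:1.0 ~feature_sizes:[|2; 3|] (fun x -> x)
      [ `A, [|0; 2|]
      ; `A, [|0; 1|]
      ; `B, [|1; 0|]
      ; `B, [|1; 2|]
      ]
  let _posteriors = eval_mv toy_mv [|0; 2|]                     (* 'cls probabilities *)
  let _prior, _lks = class_probabilities_mv toy_mv `A [|0; 2|]  (* prior, per-feature likelihoods *)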

type 'a gauss_bayes =
{ table : ('a * float * (float * float) array) list
; features : int
@@ -242,7 +309,7 @@ let log_reg_estimate ~class_f data =
let clss =
List.map (fun (c, _) ->
(* TODO: are there better choices for these? *)
let cc = if class_f c then 1 else -1 in
let cc = if class_f c then 1 else -1 in
if not (List.mem_assoc c !classes) then classes := (c, cc) :: !classes;
cc) data
|> Array.of_list
@@ -259,7 +326,7 @@ let log_reg_estimate ~class_f data =
; weights
; classes = !classes
}

type binary =
{ predicted : bool
; probability : float
36 changes: 21 additions & 15 deletions src/lib/classify.mli
@@ -17,28 +17,23 @@ type ('cls, 'ftr) naive_bayes
@raise Not_found if [bayes] never trained on [class]. *)
val class_probabilities : ('cls, 'ftr) naive_bayes -> 'cls -> float * float array

(** When estimating a probability distribution by counting observed instances
    in the feature space we may want to smooth the values, particularly if our
    training data is sparse.
    [http://en.wikipedia.org/wiki/Additive_smoothing]
*)
type smoothing =
  { factor : float                  (** Multiplicative factor *)
  ; feature_space_size : int array  (** Size of the space of each feature.
                                        Must be at least [feature_size] long. *)
  }

(** [estimate smoothing classes feature_size to_feature_array training_data]
    trains a discrete Naive Bayes classifier based on the [training_data].
    [to_feature_array] maps a feature to an integer array of indices in
    the feature space bounded by \[0,feature_size\). Optionally, [classes]
    supplies all the classes to learn; otherwise they are aggregated from
    observations in the data (this is useful for classes that may be
    'missing' in the data when smoothing is applied).
    Additive [smoothing] can be applied to the final estimates if provided.
    When estimating a probability distribution by counting observed instances
    in the feature space we may want to smooth the values, particularly if our
    training data is sparse.
    [http://en.wikipedia.org/wiki/Additive_smoothing]
*)
val estimate : ?smoothing:smoothing -> ?classes:'cls list ->
val estimate : ?smoothing:float -> ?classes:'cls list ->
  feature_size:int -> ('ftr -> int array) -> ('cls * 'ftr) list ->
  ('cls, 'ftr) naive_bayes

@@ -50,6 +45,17 @@ val estimate : ?smoothing:smoothing -> ?classes:'cls list ->
*)
val eval : ?bernoulli:bool -> ('cls, 'ftr) naive_bayes -> 'ftr -> 'cls probabilities
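A hypothetical usage sketch for the single-space variant (illustrative names and data, assuming Classify is open): each document is represented by the indices of the words it contains, drawn from a 4-word vocabulary, and eval returns the per-class posterior.

  let toy_nb =
    estimate ~smoothing:1.0 ~feature_size:4 (fun x -> x)
      [ `Spam, [|0; 1|]
      ; `Spam, [|0; 2|]
      ; `Ham , [|2; 3|]
      ; `Ham , [|1; 3|]
      ]
  let _posterior = eval toy_nb [|0; 3|]   (* 'cls probabilities for a new document *)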

type ('cls, 'ftr) naive_bayes_mv

val class_probabilities_mv : ('cls, 'ftr) naive_bayes_mv -> 'cls -> ('ftr -> float * float array)

val estimate_mv : ?smoothing:float -> ?classes:'cls list ->
  feature_sizes:int array -> ('ftr -> int array) ->
  ('cls * 'ftr) list -> ('cls, 'ftr) naive_bayes_mv

val eval_mv : ('cls, 'ftr) naive_bayes_mv -> 'ftr -> 'cls probabilities


(** A continuous Gaussian Naive Bayes classifier of class ['cls]. The
feature space is assumed to be a float array. *)
type 'cls gauss_bayes
