Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Option to break between apostrophe and vowels #2

Open
vouillon opened this issue Jan 27, 2016 · 1 comment
Open

Option to break between apostrophe and vowels #2

vouillon opened this issue Jan 27, 2016 · 1 comment

Comments

@vouillon
Copy link

It might be interesting to have the option to use rule WB5a of UAX 29, for French and Italian texts.

@dbuenzli
Copy link
Owner

I'm not sure yet whether I want to provide this directly in uuseg or as part of a larger language-aware text processing framework.

For now you should be able to play with something like this:

(* Sets of Unicode characters, ordered by [Uucp.Uchar]'s comparison. *)
module Uset = struct
  include Set.Make (Uucp.Uchar)

  (* [of_list us] is the set holding every element of the list [us]. *)
  let of_list us =
    let rec collect set = function
      | [] -> set
      | u :: rest -> collect (add u set) rest
    in
    collect empty us
end

(* Segments words according to UAX 29 plus rule WB5a, on NFD text.
   WB5a: apostrophe ÷ vowels *)

(* Phase of the WB5a filter that wraps the UAX 29 word segmenter:
   - [Buf]: a [`Boundary] was just returned in place of a vowel; that vowel
     is held in [buf] and is returned on the next [add];
   - [Apos]: the last character passed through was an apostrophe;
   - [Normal]: any other situation. *)
type word_wb5a_state = Buf | Apos | Normal

(* State of the WB5a word segmenter. *)
type word_wb5a =
  { word : Uuseg.t;                   (* underlying [`Word] segmenter *)
    vowels : Uset.t;                  (* characters that trigger a break
                                         when they follow an apostrophe *)
    mutable state : word_wb5a_state;  (* current phase of the filter *)
    mutable buf : [`Uchar of int]; }  (* vowel withheld while in [Buf] *)

(* [word_wb5a vowels] is a custom segmenter that filters the output of the
   [`Word] segmenter and inserts an extra [`Boundary] between an apostrophe
   (U+0027 or U+2019) and an immediately following character that belongs
   to [vowels].

   The wrapped segmenter's output is passed through unchanged, except that
   when a vowel directly follows an apostrophe a [`Boundary] is returned
   first and the vowel is returned on the next [add], which must be given
   [`Await]. *)
let word_wb5a vowels =
  let create () =
    { word = Uuseg.create `Word; vowels; state = Normal;
      buf = `Uchar 0x0000 }
  in
  let copy s = { s with word = Uuseg.copy s.word } in
  let add s v = match s.state with
  | Buf ->
      (* A WB5a boundary was returned in place of the vowel held in
         [s.buf]; the only valid request now is [`Await], which releases
         that vowel. *)
      if v <> `Await then Uuseg.err_exp_await v;
      s.state <- Normal; (s.buf :> Uuseg.ret)
  | Apos | Normal ->
      match Uuseg.add s.word v with
      | `Uchar u as v ->
          begin match u with
          | 0x0027 (* APOSTROPHE *)
          | 0x2019 (* RIGHT SINGLE QUOTATION MARK *) ->
              s.state <- Apos; v
          | u when s.state = Apos && Uset.mem u s.vowels ->
              (* Withhold the vowel and emit the WB5a boundary first. *)
              s.state <- Buf; s.buf <- v; `Boundary
          | _ -> s.state <- Normal; v
          end
      | `Boundary ->
          (* The underlying segmenter already breaks here. Forget any
             pending apostrophe so that a following vowel does not get a
             second, duplicate boundary. *)
          s.state <- Normal; `Boundary
      | v -> v
  in
  Uuseg.custom ~name:"Reach_text.word_wb5a" ~create ~copy ~add ()

(* Characters before which an apostrophe introduces a word break in French
   text: the vowels, the ligatures œ and æ, and the letter h, in lowercase
   and uppercase. *)
let french_vowels = Uset.of_list
      [ 0x0061 (* a *); 0x0065 (* e *); 0x0069 (* i *); 0x006F (* o *);
        0x0075 (* u *); 0x0079 (* y *);
        0x0153 (* œ *); 0x00E6 (* æ *); 0x0068 (* h *);
        0x0041 (* A *); 0x0045 (* E *); 0x0049 (* I *); 0x004F (* O *);
        0x0055 (* U *); 0x0059 (* Y *);
        (* Æ is U+00C6; U+00E7 is ç and does not belong here. *)
        0x0152 (* Œ *); 0x00C6 (* Æ *); 0x0048 (* H *); ]

(* Word boundary specification for French text: the WB5a segmenter built
   from [french_vowels], wrapped as a custom [Uuseg] boundary. *)
let french_words : Uuseg.boundary =
  let segmenter = word_wb5a french_vowels in
  `Custom segmenter

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

No branches or pull requests

2 participants