package saga

  1. Overview
  2. Docs

Module Saga_tokenizers.Unicode

Unicode utilities.

type normalization =
  | NFC
  | NFD
  | NFKC
  | NFKD
type char_category =
  | Letter
  | Number
  | Punctuation
  | Symbol
  | Whitespace
  | Control
  | Other
val categorize_char : Uchar.t -> char_category
val is_whitespace : Uchar.t -> bool
val is_punctuation : Uchar.t -> bool
val is_word_char : Uchar.t -> bool
val is_cjk : Uchar.t -> bool
val normalize : normalization -> string -> string
val case_fold : string -> string
val strip_accents : string -> string
val clean_text : ?remove_control:bool -> ?normalize_whitespace:bool -> string -> string
val split_words : string -> string list
val grapheme_count : string -> int
val is_valid_utf8 : string -> bool
val remove_emoji : string -> string