package saga

  1. Overview
  2. Docs
Text processing and NLP extensions for Nx

Install

dune-project
 Dependency

Authors

Maintainers

Sources

raven-1.0.0.alpha2.tbz
sha256=93abc49d075a1754442ccf495645bc4fdc83e4c66391ec8aca8fa15d2b4f44d2
sha512=5eb958c51f30ae46abded4c96f48d1825f79c7ce03f975f9a6237cdfed0d62c0b4a0774296694def391573d849d1f869919c49008acffca95946b818ad325f6f

doc/saga.tokenizers/Saga_tokenizers/Unicode/index.html

Module Saga_tokenizers.Unicode

Unicode utilities for normalization.

type normalization =
  | NFC
  | NFD
  | NFKC
  | NFKD
type char_category =
  | Letter
  | Number
  | Punctuation
  | Symbol
  | Whitespace
  | Control
  | Other
val categorize_char : Uchar.t -> char_category
val is_whitespace : Uchar.t -> bool
val is_punctuation : Uchar.t -> bool
val is_word_char : Uchar.t -> bool
val is_cjk : Uchar.t -> bool
val normalize : normalization -> string -> string
val case_fold : string -> string
val strip_accents : string -> string
val clean_text : ?remove_control:bool -> ?normalize_whitespace:bool -> string -> string
val split_words : string -> string list
val grapheme_count : string -> int
val is_valid_utf8 : string -> bool
val remove_emoji : string -> string