package saga

  1. Overview
  2. Docs
Text processing and NLP extensions for Nx

Install

dune-project
 Dependency

Authors

Maintainers

Sources

raven-1.0.0.alpha1.tbz
sha256=8e277ed56615d388bc69c4333e43d1acd112b5f2d5d352e2453aef223ff59867
sha512=369eda6df6b84b08f92c8957954d107058fb8d3d8374082e074b56f3a139351b3ae6e3a99f2d4a4a2930dd950fd609593467e502368a13ad6217b571382da28c

doc/saga.tokenizers/Saga_tokenizers/Unicode/index.html

Module Saga_tokenizers.Unicode

Unicode utilities.

type normalization =
  | NFC
  | NFD
  | NFKC
  | NFKD
type char_category =
  | Letter
  | Number
  | Punctuation
  | Symbol
  | Whitespace
  | Control
  | Other
val categorize_char : Uchar.t -> char_category
val is_whitespace : Uchar.t -> bool
val is_punctuation : Uchar.t -> bool
val is_word_char : Uchar.t -> bool
val is_cjk : Uchar.t -> bool
val normalize : normalization -> string -> string
val case_fold : string -> string
val strip_accents : string -> string
val clean_text : ?remove_control:bool -> ?normalize_whitespace:bool -> string -> string
val split_words : string -> string list
val grapheme_count : string -> int
val is_valid_utf8 : string -> bool
val remove_emoji : string -> string