package saga

  1. Overview
  2. Docs

Module Saga_tokenizers.Models

Tokenization models module.

type token = {
  id : int;
  value : string;
  offsets : int * int;
}

Tokenization result

type bpe_model = {
  vocab : (string, int) Hashtbl.t;
  merges : (string * string) list;
  cache_capacity : int;
  dropout : float option;
  unk_token : string option;
  continuing_subword_prefix : string option;
  end_of_word_suffix : string option;
  fuse_unk : bool;
  byte_fallback : bool;
}

Model configurations

type wordpiece_model = {
  vocab : (string, int) Hashtbl.t;
  unk_token : string;
  max_input_chars_per_word : int;
}
type wordlevel_model = {
  vocab : (string, int) Hashtbl.t;
  unk_token : string;
}
type unigram_model = {
  vocab : (string * float) list;
}
type t =
  | BPE of bpe_model
  | WordPiece of wordpiece_model
  | WordLevel of wordlevel_model
  | Unigram of unigram_model

Main model type

Constructors

val bpe : ?vocab:(string * int) list -> ?merges:(string * string) list -> ?cache_capacity:int -> ?dropout:float -> ?unk_token:string -> ?continuing_subword_prefix:string -> ?end_of_word_suffix:string -> ?fuse_unk:bool -> ?byte_fallback:bool -> ?ignore_merges:bool -> unit -> t

Create a BPE model

val wordpiece : ?vocab:(string * int) list -> ?unk_token:string -> ?continuing_subword_prefix:string -> ?max_input_chars_per_word:int -> unit -> t

Create a WordPiece model

val word_level : ?vocab:(string * int) list -> ?unk_token:string -> unit -> t

Create a WordLevel model

val unigram : ?vocab:(string * float) list -> ?unk_token:string -> ?byte_fallback:bool -> ?max_piece_length:int -> ?n_sub_iterations:int -> ?shrinking_factor:float -> unit -> t

Create a Unigram model

val chars : unit -> t

Create a character-level model

val regex : string -> t

Create a regex-based model

val from_file : vocab:string -> ?merges:string -> unit -> t

Load model from files

Operations

val tokenize : t -> string -> token list

Tokenize a string into tokens

val token_to_id : t -> string -> int option

Get the ID for a token

val id_to_token : t -> int -> string option

Get the token for an ID

val get_vocab : t -> (string * int) list

Get the vocabulary

val get_vocab_size : t -> int

Get the vocabulary size

val add_tokens : t -> string list -> int

Add tokens to the model's vocabulary. Returns number of tokens added.

val save : t -> folder:string -> ?prefix:string -> unit -> string list

Save the model to files

Serialization

val to_json : t -> Yojson.Basic.t
val of_json : Yojson.Basic.t -> t