package saga

  1. Overview
  2. Docs

Module Saga_tokenizers.Models

Tokenization models module.

type token = {
  id : int;
  value : string;
  offsets : int * int;
}

Tokenization result

type bpe_model = {
  vocab : (string, int) Hashtbl.t;
  merges : (string * string) list;
  cache_capacity : int;
  dropout : float option;
  unk_token : string option;
  continuing_subword_prefix : string option;
  end_of_word_suffix : string option;
  fuse_unk : bool;
  byte_fallback : bool;
}

Model configurations

type wordpiece_model = {
  vocab : (string, int) Hashtbl.t;
  unk_token : string;
  max_input_chars_per_word : int;
}
type wordlevel_model = {
  vocab : (string, int) Hashtbl.t;
  unk_token : string;
}
type unigram_model = {
  vocab : (string * float) list;
}
type t =
  | BPE of bpe_model
  | WordPiece of wordpiece_model
  | WordLevel of wordlevel_model
  | Unigram of unigram_model

Main model type

Constructors

val bpe : ?vocab:(string * int) list -> ?merges:(string * string) list -> ?cache_capacity:int -> ?dropout:float -> ?unk_token:string -> ?continuing_subword_prefix:string -> ?end_of_word_suffix:string -> ?fuse_unk:bool -> ?byte_fallback:bool -> ?ignore_merges:bool -> unit -> t

Create a BPE model

val wordpiece : ?vocab:(string * int) list -> ?unk_token:string -> ?continuing_subword_prefix:string -> ?max_input_chars_per_word:int -> unit -> t

Create a WordPiece model

val word_level : ?vocab:(string * int) list -> ?unk_token:string -> unit -> t

Create a WordLevel model

val unigram : ?vocab:(string * float) list -> ?unk_token:string -> ?byte_fallback:bool -> ?max_piece_length:int -> ?n_sub_iterations:int -> ?shrinking_factor:float -> unit -> t

Create a Unigram model

val chars : unit -> t

Create a character-level model

val regex : string -> t

Create a regex-based model

val from_file : vocab:string -> ?merges:string -> unit -> t

Load model from files

Operations

val tokenize : t -> string -> token list

Tokenize a string into tokens

val token_to_id : t -> string -> int option

Get the ID for a token

val id_to_token : t -> int -> string option

Get the token for an ID

val get_vocab : t -> (string * int) list

Get the vocabulary

val get_vocab_size : t -> int

Get the vocabulary size

val add_tokens : t -> string list -> int

Add tokens to the model's vocabulary. Returns number of tokens added.

val save : t -> folder:string -> ?prefix:string -> unit -> string list

Save the model to files

Serialization

val to_json : t -> Yojson.Basic.t
val of_json : Yojson.Basic.t -> t