package saga

  1. Overview
  2. Docs

Module Saga_tokenizers.Tokenizer

type t

Main tokenizer type.

type padding_config = {
  direction : direction;
  pad_id : int;
  pad_type_id : int;
  pad_token : string;
  length : int option;
  pad_to_multiple_of : int option;
}

Record describing the padding configuration.

type truncation_config = {
  max_length : int;
  stride : int;
  strategy : strategy;
  direction : direction;
}

Record describing the truncation configuration.

val create : model:Models.t -> t

Creation

Create a tokenizer from a core model.

val from_file : string -> (t, exn) result

Load a tokenizer from a JSON file. Returns Ok with the tokenizer, or Error with an exception.

val from_str : string -> (t, exn) result

Load a tokenizer from a JSON string. Returns Ok with the tokenizer, or Error with an exception.

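val from_pretrained : string -> ?revision:string -> ?token:string -> unit -> (t, exn) result

Load a pretrained tokenizer. The revision and token arguments are optional and have defaults. Returns Ok with the tokenizer, or Error with an exception.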

val from_buffer : bytes -> (t, exn) result

Load a tokenizer from a byte buffer. Returns Ok with the tokenizer, or Error with an exception.

val set_normalizer : t -> Normalizers.t option -> unit

Configuration

Set normalizer.

val get_normalizer : t -> Normalizers.t option

Get normalizer.

val set_pre_tokenizer : t -> Pre_tokenizers.t option -> unit

Set pre-tokenizer.

val get_pre_tokenizer : t -> Pre_tokenizers.t option

Get pre-tokenizer.

val set_post_processor : t -> Processors.t option -> unit

Set post-processor.

val get_post_processor : t -> Processors.t option

Get post-processor.

val set_decoder : t -> Decoders.t option -> unit

Set decoder.

val get_decoder : t -> Decoders.t option

Get decoder.

val set_model : t -> Models.t -> unit

Set model.

val get_model : t -> Models.t

Get model.

val enable_padding : t -> padding_config -> unit

Padding and Truncation

Enable padding using the given padding_config record.

val no_padding : t -> unit

Disable padding.

val get_padding : t -> padding_config option

Get padding config.

val enable_truncation : t -> truncation_config -> unit

Enable truncation using the given truncation_config record.

val no_truncation : t -> unit

Disable truncation.

val get_truncation : t -> truncation_config option

Get truncation config.

val add_tokens : t -> (string, Added_token.t) Either.t list -> int

Vocabulary Management

Add tokens, each given either as a plain string or as an Added_token.t, and return the number of tokens added.

val add_special_tokens : t -> (string, Added_token.t) Either.t list -> int

Add special tokens.

val get_vocab : t -> ?with_added_tokens:bool -> unit -> (string * int) list

Get the vocabulary as a list of (string, int) pairs. The with_added_tokens argument is optional and has a default.

val get_vocab_size : t -> ?with_added_tokens:bool -> unit -> int

Get the vocabulary size. The with_added_tokens argument is optional and has a default.

val get_added_tokens_decoder : t -> (int * Added_token.t) list

Get the added tokens as a list of (int, Added_token.t) pairs.

val token_to_id : t -> string -> int option

Token to id.

val id_to_token : t -> int -> string option

Id to token.

val train : t -> files:string list -> ?trainer:Trainers.t -> unit -> unit

Training

Train from files.

val train_from_iterator : t -> string Seq.t -> ?trainer:Trainers.t -> ?length:int -> unit -> unit

Train from text sequence.

val encode : t -> sequence:(string, string list) Either.t -> ?pair:(string, string list) Either.t -> ?is_pretokenized:bool -> ?add_special_tokens:bool -> unit -> Encoding.t

Encoding and Decoding

Encode a single sequence, or a pair of sequences when pair is given. Each sequence may be a raw string or a pre-tokenized list of strings.

val encode_batch : t -> input:((string, string list) Either.t, (string, string list) Either.t * (string, string list) Either.t) Either.t list -> ?is_pretokenized:bool -> ?add_special_tokens:bool -> unit -> Encoding.t list

Encode a batch of inputs. Each input is either a single sequence or a pair of sequences, and each sequence may be a raw string or a pre-tokenized list of strings.

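val decode : t -> int list -> ?skip_special_tokens:bool -> ?clean_up_tokenization_spaces:bool -> unit -> string

Decode a list of ids into a string. The skip_special_tokens and clean_up_tokenization_spaces arguments are optional and have defaults.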

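val decode_batch : t -> int list list -> ?skip_special_tokens:bool -> ?clean_up_tokenization_spaces:bool -> unit -> string list

Decode a batch of id lists into a list of strings. The skip_special_tokens and clean_up_tokenization_spaces arguments are optional and have defaults.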

val post_process : t -> encoding:Encoding.t -> ?pair:Encoding.t -> ?add_special_tokens:bool -> unit -> Encoding.t

Post-process manually.

val num_special_tokens_to_add : t -> is_pair:bool -> int

Number of special tokens to add; is_pair selects between the single-sequence and pair cases.

val save : t -> path:string -> ?pretty:bool -> unit -> unit

Serialization

Save the tokenizer to the file at path. The pretty argument is optional and has a default.

val save_pretrained : t -> path:string -> unit

Save the tokenizer to path in the pretrained format.

val to_str : t -> ?pretty:bool -> unit -> string

Serialize the tokenizer to a JSON string. The pretty argument is optional and has a default.