package saga

  1. Overview
  2. Docs

Module Saga_tokenizers.Encoding

Encoding module - represents the output of a tokenizer

type t

The main encoding type - abstract to users

val create : ids:int array -> type_ids:int array -> tokens:string array -> words:int option array -> offsets:(int * int) array -> special_tokens_mask:int array -> attention_mask:int array -> overflowing:t list -> sequence_ranges:(int, int * int) Hashtbl.t -> t

Create a new encoding - for internal use

val with_capacity : int -> t

Create an empty encoding with given capacity

val from_tokens : (int * string * (int * int)) list -> type_id:int -> t

Create encoding from tokens

val is_empty : t -> bool

Check if encoding is empty

val length : t -> int

Get the length of the encoding

val n_sequences : t -> int

Get the number of sequences in the encoding

val set_sequence_id : t -> int -> t

Set sequence id for the whole encoding

Accessors

val get_ids : t -> int array

Get IDs

val get_type_ids : t -> int array

Get type IDs

val set_type_ids : t -> int array -> t

Set type IDs

val get_tokens : t -> string array

Get tokens

val get_word_ids : t -> int option array

Get word IDs

val get_sequence_ids : t -> int option array

Get sequence IDs for each token

val get_offsets : t -> (int * int) array

Get offsets

val get_special_tokens_mask : t -> int array

Get special tokens mask

val get_attention_mask : t -> int array

Get attention mask

val get_overflowing : t -> t list

Get overflowing encodings

val set_overflowing : t -> t list -> t

Set overflowing encodings

val take_overflowing : t -> t * t list

Take overflowing encodings (removes them from encoding)

Token/Word/Char mappings

val token_to_sequence : t -> int -> int option

Get the sequence index containing the given token

val token_to_word : t -> int -> (int * int) option

Get the word containing the given token

val token_to_chars : t -> int -> (int * (int * int)) option

Get the character offsets of the given token

val word_to_tokens : t -> word:int -> sequence_id:int -> (int * int) option

Get the tokens corresponding to the given word

val word_to_chars : t -> word:int -> sequence_id:int -> (int * int) option

Get the character offsets of the given word

val char_to_token : t -> pos:int -> sequence_id:int -> int option

Get the token containing the given character position

val char_to_word : t -> pos:int -> sequence_id:int -> int option

Get the word containing the given character position

Operations

type truncation_direction =
  | Left
  | Right

Truncation direction

val truncate : t -> max_length:int -> stride:int -> direction:truncation_direction -> t

Truncate the encoding

val merge : t list -> growing_offsets:bool -> t

Merge multiple encodings

val merge_with : t -> t -> growing_offsets:bool -> t

Merge with another encoding in place

type padding_direction =
  | Left
  | Right

Padding direction

val pad : t -> target_length:int -> pad_id:int -> pad_type_id:int -> pad_token:string -> direction:padding_direction -> t

Pad the encoding