package saga

You can search for identifiers within the package.

in-package search v0.2.0

On This Page

Accessors
Token/Word/Char mappings
Operations

package saga

saga
- CHANGES
- README
- Library saga
  - Saga
    
    Either
    
    Unicode
    
    Models
    
    Normalizers
    
    Pre_tokenizers
    
    Processors
    
    Decoders
    
    Trainers
    
    Encoding
    
    Bpe
    
    Wordpiece
    
    Added_token
    
    Tokenizer
    
    Sampler
- Library saga.models
  - Saga_models
    
    Ngram
- Library saga.tokenizers
  - Saga_tokenizers
    
    Either
    
    Unicode
    
    Models
    
    Normalizers
    
    Pre_tokenizers
    
    Processors
    
    Decoders
    
    Trainers
    
    Encoding
    
    Bpe
    
    Builder
    
    Trainer
    
    Wordpiece
    
    Builder
    
    Trainer
    
    Added_token
    
    Tokenizer
- Sources
  - saga
    
    io.ml
    
    lm.ml
    
    saga.ml
    
    saga__.ml
    
    sampler.ml
  - saga.models
    
    ngram.ml
    
    saga_models.ml
    
    saga_models__.ml
  - saga.tokenizers
    
    bpe.ml
    
    decoders.ml
    
    encoding.ml
    
    models.ml
    
    normalizers.ml
    
    pre_tokenizers.ml
    
    processors.ml
    
    saga_tokenizers.ml
    
    saga_tokenizers__.ml
    
    trainers.ml
    
    unicode.ml
    
    wordpiece.ml

Legend:
Page
Library
Module
Module type
Parameter
Class
Class type
Source

Module `Saga_tokenizers.Encoding` Source

Encoding module: represents the output of a tokenizer.

Source

type t

The main encoding type - abstract to users

Source

val create : 
  ids:int array ->
  type_ids:int array ->
  tokens:string array ->
  words:int option array ->
  offsets:(int * int) array ->
  special_tokens_mask:int array ->
  attention_mask:int array ->
  overflowing:t list ->
  sequence_ranges:(int, int * int) Hashtbl.t ->
  t

Create a new encoding - for internal use

Source

val with_capacity : int -> t

Create an empty encoding with given capacity

Source

val from_tokens : (int * string * (int * int)) list -> type_id:int -> t

Create encoding from tokens

Source

val is_empty : t -> bool

Check if encoding is empty

Source

val length : t -> int

Get the length of the encoding

Source

val n_sequences : t -> int

Get the number of sequences in the encoding

Source

val set_sequence_id : t -> int -> t

Set sequence id for the whole encoding

Accessors

Source

val get_ids : t -> int array

Get IDs

Source

val get_type_ids : t -> int array

Get type IDs

Source

val set_type_ids : t -> int array -> t

Set type IDs

Source

val get_tokens : t -> string array

Get tokens

Source

val get_word_ids : t -> int option array

Get word IDs

Source

val get_sequence_ids : t -> int option array

Get sequence IDs for each token

Source

val get_offsets : t -> (int * int) array

Get offsets

Source

val get_special_tokens_mask : t -> int array

Get special tokens mask

Source

val get_attention_mask : t -> int array

Get attention mask

Source

val get_overflowing : t -> t list

Get overflowing encodings

Source

val set_overflowing : t -> t list -> t

Set overflowing encodings

Source

val take_overflowing : t -> t * t list

Take overflowing encodings (removes them from encoding)

Token/Word/Char mappings

Source

val token_to_sequence : t -> int -> int option

Get the sequence index containing the given token

Source

val token_to_word : t -> int -> (int * int) option

Get the word containing the given token

Source

val token_to_chars : t -> int -> (int * (int * int)) option

Get the character offsets of the given token

Source

val word_to_tokens : t -> word:int -> sequence_id:int -> (int * int) option

Get the tokens corresponding to the given word

Source

val word_to_chars : t -> word:int -> sequence_id:int -> (int * int) option

Get the character offsets of the given word

Source

val char_to_token : t -> pos:int -> sequence_id:int -> int option

Get the token containing the given character position

Source

val char_to_word : t -> pos:int -> sequence_id:int -> int option

Get the word containing the given character position

Operations

Source

type truncation_direction =

| Left
| Right

Truncation direction

Source

val truncate : 
  t ->
  max_length:int ->
  stride:int ->
  direction:truncation_direction ->
  t

Truncate the encoding

Source

val merge : t list -> growing_offsets:bool -> t

Merge multiple encodings

Source

val merge_with : t -> t -> growing_offsets:bool -> t

Merge with another encoding in place

Source

type padding_direction =

| Left
| Right

Padding direction

Source

val pad : 
  t ->
  target_length:int ->
  pad_id:int ->
  pad_type_id:int ->
  pad_token:string ->
  direction:padding_direction ->
  t

Pad the encoding

package saga

Module Saga_tokenizers.Encoding Source

Accessors

Token/Word/Char mappings

Operations

Module `Saga_tokenizers.Encoding` Source