package saga

  1. Overview
  2. Docs

Module Saga_tokenizers.Special

val make : ?single_word:bool -> ?lstrip:bool -> ?rstrip:bool -> ?normalized:bool -> string -> special

make ?single_word ?lstrip ?rstrip ?normalized token creates a special token configuration.

All optional parameters default to false, the appropriate setting for special tokens:

  • single_word: false - can match partial words
  • lstrip: false - don't strip left whitespace
  • rstrip: false - don't strip right whitespace
  • normalized: false - special tokens not normalized
val pad : string -> special

pad token creates a padding token (e.g., "<pad>").

val unk : string -> special

unk token creates an unknown token (e.g., "<unk>").

val bos : string -> special

bos token creates a beginning-of-sequence token (e.g., "<s>").

val eos : string -> special

eos token creates an end-of-sequence token (e.g., "</s>").

val cls : string -> special

cls token creates a classification token (e.g., "[CLS]").

val sep : string -> special

sep token creates a separator token (e.g., "[SEP]").

val mask : string -> special

mask token creates a mask token (e.g., "[MASK]").