Library
Module
Module type
Parameter
Class
Class type
Parsing fonts and extracting text from content streams and PDF strings
(** Record describing the glyph data of a Type 3 font (judging by the type and
    field names — confirm against the code that builds it). NOTE: the spelling
    [glpyhs] is the declared public type name, so it is kept as-is. *)
type type3_glpyhs = {
(* Font bounding box as four floats; the corner order is not visible here. *)
fontbbox : float * float * float * float;
(* Font matrix, stored as a [Pdftransform.transform_matrix]. *)
fontmatrix : Pdftransform.transform_matrix;
(* Association list from a string (presumably the glyph name — TODO confirm)
   to a PDF object. *)
charprocs : (string * Pdf.pdfobject) list;
(* Resources belonging to the font, kept as a raw PDF object. *)
type3_resources : Pdf.pdfobject;
}
(** Font descriptor record: a set of float-valued metrics, an integer [flags]
    field, and three optional components (font file, charset, ToUnicode
    table). NOTE(review): the fields presumably mirror the entries of a PDF
    /FontDescriptor dictionary — confirm units and defaults against the
    reader. *)
type fontdescriptor = {
ascent : float;
descent : float;
avgwidth : float;
maxwidth : float;
(* Integer flags value; the meaning of individual bits is not visible here. *)
flags : int;
(* Font bounding box as four floats; the corner order is not visible here. *)
fontbbox : float * float * float * float;
italicangle : float;
capheight : float;
xheight : float;
stemv : float;
(* Optional font file; [None] presumably means no font program is embedded
   — TODO confirm. *)
fontfile : fontfile option;
(* Optional list of strings (presumably glyph names — TODO confirm). *)
charset : string list option;
(* Optional hash table from int keys to strings; presumably character code to
   unicode text, matching the (int * string) pairs that [parse_tounicode]
   returns — TODO confirm. *)
tounicode : (int, string) Hashtbl.t option;
}
(** Character encoding of a font. Five constant constructors name fixed
    encodings; the last two are recursive and wrap another [encoding]. *)
type encoding =
(* Presumably: use whatever encoding the font file itself defines — confirm. *)
| ImplicitInFontFile
| StandardEncoding
| MacRomanEncoding
| WinAnsiEncoding
| MacExpertEncoding
(* A base encoding paired with a [differences] value (type declared outside
   this view); presumably the differences override entries of the base. *)
| CustomEncoding of encoding * differences
(* Wraps another encoding; judging by the name, entries it leaves undefined
   are taken from the standard encoding — TODO confirm. *)
| FillUndefinedWithStandard of encoding
(** A simple (non-composite) font: its kind, base font name, character-code
    range, width table, optional descriptor and metrics, and encoding. *)
type simple_font = {
fonttype : simple_fonttype;
basefont : string;
(* [firstchar] and [lastchar] are integer character codes; presumably
   [widths] holds one entry per code in that range — TODO confirm. *)
firstchar : int;
lastchar : int;
(* Widths are integers here, unlike the float widths in [composite_CIDfont]. *)
widths : int array;
fontdescriptor : fontdescriptor option;
fontmetrics : fontmetrics option;
encoding : encoding;
}
(** A composite CID font: system info, base font name, a mandatory font
    descriptor (not optional, unlike in [simple_font]), and width data. *)
type composite_CIDfont = {
cid_system_info : cid_system_info;
cid_basefont : string;
cid_fontdescriptor : fontdescriptor;
(* List of (int, float) pairs; presumably CID paired with its width
   — TODO confirm. *)
cid_widths : (int * float) list;
(* Integer default width; presumably used for entries absent from
   [cid_widths] — TODO confirm. *)
cid_default_width : int;
}
(** The top-level font datatype, with one constructor per family of font. *)
type font =
(* A standard font together with the encoding used with it. *)
| StandardFont of standard_font * encoding
(* A simple font, described by a [simple_font] record. *)
| SimpleFont of simple_font
(* A string (presumably the font name — TODO confirm), the composite CID font
   record, and its CMap encoding. *)
| CIDKeyedFont of string * composite_CIDfont * cmap_encoding
(** [string_of_standard_font f] is the name of standard font [f] as a string,
    for example ["Times-Bold"]. *)
val string_of_standard_font : standard_font -> string
Returns a string such as "Times-Bold" for Pdftext.TimesBold etc.
(** [standard_font_of_name s] parses a font name such as ["/Times-Bold"] or
    ["/TimesNewRoman,Bold"] into a standard font. The result is an [option];
    presumably [None] means [s] names no standard font — TODO confirm. *)
val standard_font_of_name : string -> standard_font option
Parses a string such as "/Times-Bold" or "/TimesNewRoman,Bold" to Pdftext.TimesBold etc.
(** [string_of_font f] renders the whole font value [f] as a string intended
    for debugging. *)
val string_of_font : font -> string
A debug string for the whole font datatype.
(** [read_font pdf obj] reads the font described by object [obj] in document
    [pdf] and returns it as a [font] value. *)
val read_font : Pdf.t -> Pdf.pdfobject -> font
Read a font from a given document and object
Write a font to a given document, returning the object number for the main font dictionary
Is a PDF string UTF16BE (i.e. does it have a byte order mark at the beginning)?
(** [is_identity_h f] tests whether font [f] is Identity-H.
    NOTE(review): presumably this inspects the CMap encoding of a CID-keyed
    font — confirm against the implementation. *)
val is_identity_h : font -> bool
Is a font Identity H?
A UTF16BE string for a list of unicode codepoints (with BOM)
Take a pdf string (which will be either pdfdocencoding or UTF16BE) and return a string representing the same unicode codepoints in UTF8
Take a UTF8 string and convert to pdfdocencoding (if no unicode-only characters are used) or UTF16BE (if they are).
Build a pdf string in pdfdocencoding (if no unicode-only characters are used) or UTF16BE (if they are)
Produce a list of unicode codepoints from a pdfdocencoding or UTF16BE pdf document string
Remake a UTF16BE string into a PDFDocEncoding string if all characters are in PDFDocEncoding
(** [text_extractor_of_font pdf obj] builds a text extractor from document
    [pdf] and font object [obj]. *)
val text_extractor_of_font : Pdf.t -> Pdf.pdfobject -> text_extractor
Build a text extractor from a document and font object
(** [text_extractor_of_font_real f] builds a text extractor directly from the
    [font] value [f]; unlike [text_extractor_of_font], it takes no document
    argument. *)
val text_extractor_of_font_real : font -> text_extractor
Build a text extractor from a font.
(** [codepoints_of_text extractor s] returns the list of unicode codepoints
    that [extractor] yields for string [s]. *)
val codepoints_of_text : text_extractor -> string -> int list
Return a list of unicode codepoints from a given extractor and string (for example from a Pdfpages.Op_Tj or Op_TJ operator).
(** [glyphnames_of_text extractor s] returns the list of glyph names that
    [extractor] yields for string [s]. *)
val glyphnames_of_text : text_extractor -> string -> string list
Return a list of glyph names from a given extractor and string
(** [charcode_extractor_of_font ?debug pdf fontobj codepoint] looks up the
    character code for unicode [codepoint] in the font given by object
    [fontobj] of document [pdf]. The result is an [int option], so a missing
    character does not have to raise. *)
val charcode_extractor_of_font :
?debug:bool ->
Pdf.t ->
Pdf.pdfobject ->
int ->
int option
Return the character code for a given unicode codepoint, if it exists in the encoding and font object. If debug is set (default false), missing characters are reported to stderr.
(** [charcode_extractor_of_font_real ?debug f codepoint] looks up the
    character code for unicode [codepoint] in the [font] value [f]. The
    result is an [int option]. *)
val charcode_extractor_of_font_real : ?debug:bool -> font -> int -> int option
Return the character code for a given unicode codepoint, if it exists in the encoding and font. If debug is set (default false), missing characters are reported to stderr.
Reverse table of all the entries in an encoding.
(** [parse_tounicode pdf obj] parses the /ToUnicode entry [obj] of document
    [pdf] into a list of (int, string) pairs; presumably character code paired
    with its unicode text — TODO confirm the string's encoding. *)
val parse_tounicode : Pdf.t -> Pdf.pdfobject -> (int * string) list
Parse a /ToUnicode entry.