Library
Module
Module type
Parameter
Class
Class type
Parsing fonts and extracting text from content streams and PDF strings
(* Record of the data kept for a Type 3 font (judging by the field names).
   NOTE(review): the type name is spelled "glpyhs", presumably a typo for
   "glyphs"; it is left unchanged because renaming it would alter the public
   interface. *)
type type3_glpyhs = {
(* Bounding box as four floats. Presumably (llx, lly, urx, ury) as in the PDF
   /FontBBox entry -- TODO confirm the component order. *)
fontbbox : float * float * float * float;
(* Font matrix, represented as a Pdftransform transformation matrix. *)
fontmatrix : Pdftransform.transform_matrix;
(* Association list pairing a string with a PDF object. Presumably glyph name
   -> glyph description stream -- verify against the reader. *)
charprocs : (string * Pdf.pdfobject) list;
(* Resources of the Type 3 font, kept as a raw PDF object. *)
type3_resources : Pdf.pdfobject;
}
(* Font descriptor record. The field names appear to mirror the entries of a
   PDF font descriptor dictionary (Ascent, Descent, AvgWidth, ...) --
   presumably they are populated from it; confirm against the reader.
   The [fontfile] type is declared outside this view.
   Note: [fontbbox] is also a field of [type3_glpyhs]. Because this record is
   declared later, an unannotated [r.fontbbox] resolves to this type; add a
   type annotation when the Type 3 record is meant. *)
type fontdescriptor = {
ascent : float;
descent : float;
avgwidth : float;
maxwidth : float;
(* Integer flags word. Presumably the bit field of the /Flags entry -- TODO
   confirm. *)
flags : int;
(* Bounding box as four floats. Presumably (llx, lly, urx, ury) -- TODO
   confirm the component order. *)
fontbbox : float * float * float * float;
italicangle : float;
capheight : float;
xheight : float;
stemv : float;
(* Optional font file; [None] when absent (presumably for a non-embedded
   font). *)
fontfile : fontfile option;
(* Optional list of strings. Presumably the glyph names from /CharSet --
   verify. *)
charset : string list option;
(* Optional hash table from int keys to strings. Presumably character code ->
   unicode text derived from a ToUnicode CMap -- TODO confirm the key meaning
   and the string encoding. *)
tounicode : (int, string) Hashtbl.t option;
}
(* Character encoding of a font. The type is recursive: the last two
   constructors wrap another [encoding]. The [differences] type is declared
   outside this view. *)
type encoding =
(* Presumably: no explicit encoding, use the one built into the font file. *)
| ImplicitInFontFile
| StandardEncoding
| MacRomanEncoding
| WinAnsiEncoding
| MacExpertEncoding
(* An encoding paired with a [differences] value. Presumably a base encoding
   overridden by a /Differences array -- TODO confirm. *)
| CustomEncoding of encoding * differences
(* Wraps an encoding. Judging by the name, codes left undefined by the wrapped
   encoding fall back to the standard encoding -- verify against the
   implementation. *)
| FillUndefinedWithStandard of encoding
(* Description of a simple (non-composite) font. The [simple_fonttype] and
   [fontmetrics] types are declared outside this view. *)
type simple_font = {
fonttype : simple_fonttype;
(* Base font name, as a string. *)
basefont : string;
(* Integer bounds. Presumably the first and last character codes covered by
   [widths] -- TODO confirm. *)
firstchar : int;
lastchar : int;
(* Integer widths. Presumably [widths.(i)] is the width of character code
   [firstchar + i] -- verify the indexing against the reader. *)
widths : int array;
(* Optional font descriptor; [None] when the font has none. *)
fontdescriptor : fontdescriptor option;
(* Optional font metrics. *)
fontmetrics : fontmetrics option;
encoding : encoding;
}
(* Description of a CID font used by a composite font. The [cid_system_info]
   type is declared outside this view. Unlike [simple_font], the font
   descriptor here is mandatory (not an [option]). *)
type composite_CIDfont = {
cid_system_info : cid_system_info;
(* Base font name, as a string. *)
cid_basefont : string;
cid_fontdescriptor : fontdescriptor;
(* Association list of (int, float) pairs. Presumably CID -> glyph width --
   TODO confirm. *)
cid_widths : (int * float) list;
(* Default width as an int (note: the per-entry widths above are floats).
   Presumably used for CIDs missing from [cid_widths] -- verify. *)
cid_default_width : int;
}
(* The top-level font datatype: one of three kinds of font. The
   [standard_font] and [cmap_encoding] types are declared outside this view. *)
type font =
(* A standard font together with its encoding. *)
| StandardFont of standard_font * encoding
(* A simple font, fully described by a [simple_font] record. *)
| SimpleFont of simple_font
(* A CID-keyed font: a string (presumably the base font name -- TODO
   confirm), the CID font description, and its CMap encoding. *)
| CIDKeyedFont of string * composite_CIDfont * cmap_encoding
(** [string_of_standard_font f] is the name of the standard font [f] as a
    string. *)
val string_of_standard_font : standard_font -> string
Returns a string such as "Times-Bold" for Pdftext.TimesBold etc.
(** [standard_font_of_name s] parses the font name [s] into a standard font.
    The result is an [option]; presumably [None] means [s] does not name a
    standard font -- confirm against the implementation. *)
val standard_font_of_name : string -> standard_font option
Parses a string such as "/Times-Bold" or "/TimesNewRoman,Bold" to Pdftext.TimesBold etc.
(** [string_of_font f] renders the whole of [f] as a string, intended for
    debugging. *)
val string_of_font : font -> string
A debug string for the whole font datatype.
(** [read_font pdf obj] reads a [font] from the object [obj] in the document
    [pdf]. *)
val read_font : Pdf.t -> Pdf.pdfobject -> font
Read a font from a given document and object
Write a font to a given document, returning the object number for the main font dictionary
Is a PDF string UTF16BE (i.e. does it have a byte order mark at the beginning)?
(** [is_identity_h f] is [true] when the font [f] is Identity H. *)
val is_identity_h : font -> bool
Is a font Identity H?
A UTF16BE string for a list of unicode codepoints (with BOM)
Take a pdf string (which will be either pdfdocencoding or UTF16BE) and return a string representing the same unicode codepoints in UTF8
Take a UTF8 string and convert to pdfdocencoding (if no unicode-only characters are used) or UTF16BE (if they are)
Build a pdf string in pdfdocencoding (if no unicode-only characters are used) or UTF16BE (if they are)
Produce a list of unicode codepoints from a pdfdocencoding or UTF16BE pdf document string
Remake a UTF16BE string into a PDFDocEncoding string if all characters are in PDFDocEncoding
(** [text_extractor_of_font pdf obj] builds a text extractor for the font
    object [obj] in the document [pdf]. *)
val text_extractor_of_font : Pdf.t -> Pdf.pdfobject -> text_extractor
Build a text extractor from a document and font object
(** [text_extractor_of_font_real f] builds a text extractor directly from a
    [font] value; unlike [text_extractor_of_font] it takes no document
    argument. *)
val text_extractor_of_font_real : font -> text_extractor
Build a text extractor from a font
(** [codepoints_of_text extractor s] is the list of unicode codepoints that
    the string [s] denotes under [extractor]. *)
val codepoints_of_text : text_extractor -> string -> int list
Return a list of unicode codepoints from a given extractor and string (for example from a Pdfpages.Op_Tj or Op_TJ operator).
(** [glyphnames_of_text extractor s] is the list of glyph names that the
    string [s] denotes under [extractor]. *)
val glyphnames_of_text : text_extractor -> string -> string list
Return a list of glyph names from a given extractor and string
(** [charcode_extractor_of_font ?debug pdf obj codepoint] looks up the
    character code for the unicode [codepoint] in the font object [obj] of
    the document [pdf]. The result is [Some code] when the codepoint exists
    in the encoding and font object, and [None] otherwise. [debug] is
    optional. *)
val charcode_extractor_of_font :
?debug:bool ->
Pdf.t ->
Pdf.pdfobject ->
int ->
int option
Return the character code for a given unicode codepoint, if it exists in the encoding and font object. If debug is set (default false), missing characters are reported to stderr.
(** [charcode_extractor_of_font_real ?debug f codepoint] looks up the
    character code for the unicode [codepoint] in the already-read font [f].
    The result is [Some code] when the codepoint exists in the encoding and
    font, and [None] otherwise. [debug] is optional. *)
val charcode_extractor_of_font_real : ?debug:bool -> font -> int -> int option
Return the character code for a given unicode codepoint, if it exists in the encoding and font. If debug is set (default false), missing characters are reported to stderr.