package camlpdf

  1. Overview
  2. Docs

Module Pdftext

Parsing fonts and extracting text from content streams and PDF strings

Data Types

type type3_glpyhs = {
  1. fontbbox : float * float * float * float;
  2. fontmatrix : Pdftransform.transform_matrix;
  3. charprocs : (string * Pdf.pdfobject) list;
  4. type3_resources : Pdf.pdfobject;
}
type simple_fonttype =
  1. | Type1
  2. | MMType1
  3. | Type3 of type3_glpyhs
  4. | Truetype
type fontfile =
  1. | FontFile of int
  2. | FontFile2 of int
  3. | FontFile3 of int
type fontdescriptor = {
  1. ascent : float;
  2. descent : float;
  3. avgwidth : float;
  4. maxwidth : float;
  5. flags : int;
  6. fontbbox : float * float * float * float;
  7. italicangle : float;
  8. capheight : float;
  9. xheight : float;
  10. stemv : float;
  11. fontfile : fontfile option;
  12. charset : string list option;
  13. tounicode : (int, string) Hashtbl.t option;
}
type differences = (string * int) list
type encoding =
  1. | ImplicitInFontFile
  2. | StandardEncoding
  3. | MacRomanEncoding
  4. | WinAnsiEncoding
  5. | MacExpertEncoding
  6. | CustomEncoding of encoding * differences
  7. | FillUndefinedWithStandard of encoding
type fontmetrics = float array
type simple_font = {
  1. fonttype : simple_fonttype;
  2. basefont : string;
  3. firstchar : int;
  4. lastchar : int;
  5. widths : int array;
  6. fontdescriptor : fontdescriptor option;
  7. fontmetrics : fontmetrics option;
  8. encoding : encoding;
}
type standard_font =
  1. | TimesRoman
  2. | TimesBold
  3. | TimesItalic
  4. | TimesBoldItalic
  5. | Helvetica
  6. | HelveticaBold
  7. | HelveticaOblique
  8. | HelveticaBoldOblique
  9. | Courier
  10. | CourierBold
  11. | CourierOblique
  12. | CourierBoldOblique
  13. | Symbol
  14. | ZapfDingbats
type cid_system_info = {
  1. registry : string;
  2. ordering : string;
  3. supplement : int;
}
type composite_CIDfont = {
  1. cid_system_info : cid_system_info;
  2. cid_basefont : string;
  3. cid_fontdescriptor : fontdescriptor;
  4. cid_widths : (int * float) list;
  5. cid_default_width : int;
}
type cmap_encoding =
  1. | Predefined of string
  2. | CMap of int
type font =
  1. | StandardFont of standard_font * encoding
  2. | SimpleFont of simple_font
  3. | CIDKeyedFont of string * composite_CIDfont * cmap_encoding

String representations of fonts

val string_of_standard_font : standard_font -> string

Returns a string such as "Times-Bold" for Pdftext.TimesBold etc.

val standard_font_of_name : string -> standard_font option

Parses a string such as "/Times-Bold" or "/TimesNewRoman,Bold" to Pdftext.TimesBold etc., returning None if the name is not recognised as a standard font.

val string_of_font : font -> string

A debug string for the whole font datatype.

Reading a Font

val read_font : Pdf.t -> Pdf.pdfobject -> font

Read a font from a given document and object

Writing a Font

val write_font : ?objnum:int -> Pdf.t -> font -> int

Write a font to a given document, returning the object number for the main font dictionary

Utility functions

val is_unicode : string -> bool

Is a PDF string UTF16BE (i.e. does it have a byte order mark at the beginning)?

val is_identity_h : font -> bool

Is a font Identity H?

val codepoints_of_utf8 : string -> int list

A list of unicode codepoints for a UTF8 string

val utf8_of_codepoints : int list -> string

A UTF8 string for a list of unicode codepoints

val codepoints_of_utf16be : string -> int list

A list of unicode codepoints for a UTF16BE string

val utf16be_of_codepoints : int list -> string

A UTF16BE string for a list of unicode codepoints (with BOM)

Text from strings outside page content

val utf8_of_pdfdocstring : string -> string

Take a pdf string (which will be either pdfdocencoding or UTF16BE) and return a string representing the same unicode codepoints in UTF8

val pdfdocstring_of_utf8 : string -> string

Take a UTF8 string and convert to pdfdocencoding (if no unicode-only characters are used) or UTF16BE (if they are)

val pdfdocstring_of_codepoints : int list -> string

Build a pdf string in pdfdocencoding (if no unicode-only characters are used) or UTF16BE (if they are)

val codepoints_of_pdfdocstring : string -> int list

Produce a list of unicode codepoints from a pdfdocencoding or UTF16BE pdf document string

val simplify_utf16be : string -> string

Remake a UTF16BE string into a PDFDocEncoding string if all characters are in PDFDocEncoding

Text from strings inside page content

type text_extractor

The type of text extractors.

val text_extractor_of_font : Pdf.t -> Pdf.pdfobject -> text_extractor

Build a text extractor from a document and font object

val text_extractor_of_font_real : font -> text_extractor

Build a text extractor from a font

val codepoints_of_text : text_extractor -> string -> int list

Return a list of unicode codepoints from a given extractor and string (for example from a Pdfpages.Op_Tj or Op_TJ operator).

val glyphnames_of_text : text_extractor -> string -> string list

Return a list of glyph names from a given extractor and string

Building text for strings inside page content

val charcode_extractor_of_font : ?debug:bool -> Pdf.t -> Pdf.pdfobject -> int -> int option

Return the character code for a given unicode codepoint, if it exists in the encoding and font object. If debug is set (default false) missing characters are reported to stderr.

val charcode_extractor_of_font_real : ?debug:bool -> font -> int -> int option

Return the character code for a given unicode codepoint, if it exists in the encoding and font. If debug is set (default false) missing characters are reported to stderr.

val table_of_encoding : encoding -> (int, string) Hashtbl.t

Table of all the entries in an encoding.

val reverse_table_of_encoding : encoding -> (string, int) Hashtbl.t

Reverse table of all the entries in an encoding.

val parse_tounicode : Pdf.t -> Pdf.pdfobject -> (int * string) list

Parse a /ToUnicode entry.