-
Notifications
You must be signed in to change notification settings - Fork 29
/
Copy pathpdftext.mli
198 lines (151 loc) · 5.88 KB
/
pdftext.mli
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
(** Parsing fonts and extracting text from content streams and PDF strings *)
(** {2 Data Types } *)
(** Data carried by the [Type3] constructor of {!simple_fonttype}.

    - [fontbbox]: four floats; presumably the PDF [/FontBBox] rectangle
      (corner order to be confirmed against the implementation).
    - [fontmatrix]: a {!Pdftransform.transform_matrix}; presumably the PDF
      [/FontMatrix].
    - [charprocs]: association list pairing a string (presumably a glyph
      name) with a PDF object.
    - [type3_resources]: a PDF object; presumably the [/Resources] entry of
      the font.

    NOTE(review): the type name is spelled [glpyhs] (sic). It is part of the
    public interface, so it is left unchanged here. *)
type type3_glpyhs =
{fontbbox : float * float * float * float;
fontmatrix : Pdftransform.transform_matrix;
charprocs : (string * Pdf.pdfobject) list;
type3_resources : Pdf.pdfobject}
(** The kind of a simple font, as stored in the [fonttype] field of
    {!simple_font}. Only [Type3] carries extra data (a {!type3_glpyhs}); the
    other constructors are plain tags. *)
type simple_fonttype =
| Type1
| MMType1
| Type3 of type3_glpyhs
| Truetype
(** A font file reference, as held in the optional [fontfile] field of
    {!fontdescriptor}. The constructor names presumably correspond to the PDF
    font descriptor keys [/FontFile], [/FontFile2] and [/FontFile3], and the
    [int] is presumably the object number of the referenced stream -- TODO
    confirm against the implementation. *)
type fontfile =
| FontFile of int
| FontFile2 of int
| FontFile3 of int
(** A font descriptor. The field names presumably mirror the entries of a PDF
    font descriptor dictionary ([/Ascent], [/Descent], [/AvgWidth], ...) --
    TODO confirm units and defaults against the implementation.

    The last three fields are optional:
    - [fontfile]: a {!fontfile} reference, if any.
    - [charset]: a list of strings (presumably glyph names), if any.
    - [tounicode]: a hash table from [int] to [string] (presumably character
      code to unicode text, cf. {!parse_tounicode}), if any.

    NOTE(review): the label [fontbbox] is also declared by {!type3_glpyhs};
    a type annotation may be needed where the record type cannot be
    inferred. *)
type fontdescriptor =
{ascent : float;
descent : float;
avgwidth : float;
maxwidth : float;
flags : int;
fontbbox: float * float * float * float;
italicangle : float;
capheight : float;
xheight : float;
stemv : float;
fontfile : fontfile option;
charset : string list option;
tounicode : (int, string) Hashtbl.t option}
(** A list of (string, int) pairs, used by the [CustomEncoding] constructor of
    {!encoding}. Presumably each pair associates a glyph name with the
    character code it is assigned to, as in a PDF [/Differences] array --
    TODO confirm. *)
type differences = (string * int) list
(** Font encodings.

    - [CustomEncoding (base, diffs)] pairs an encoding with a {!differences}
      list.
    - [FillUndefinedWithStandard e] wraps another encoding; presumably codes
      left undefined by [e] fall back to the standard encoding -- TODO
      confirm.
    - The remaining constructors carry no data. *)
type encoding =
| ImplicitInFontFile
| StandardEncoding
| MacRomanEncoding
| WinAnsiEncoding
| MacExpertEncoding
| CustomEncoding of encoding * differences
| FillUndefinedWithStandard of encoding
(** Font metrics: an array holding the widths of glyphs 0..255. *)
type fontmetrics = float array
(** A simple font, as carried by the [SimpleFont] constructor of {!font}.

    - [fonttype]: the kind of font (see {!simple_fonttype}).
    - [basefont]: a string; presumably the PDF [/BaseFont] name.
    - [firstchar], [lastchar]: ints; presumably the first and last character
      codes covered by [widths] -- TODO confirm.
    - [widths]: an array of integer widths.
    - [fontdescriptor]: an optional {!fontdescriptor}.
    - [fontmetrics]: optional {!fontmetrics} (float widths of glyphs 0..255).
    - [encoding]: the {!encoding} of the font. *)
type simple_font =
{fonttype : simple_fonttype;
basefont : string;
firstchar : int;
lastchar : int;
widths : int array;
fontdescriptor : fontdescriptor option;
fontmetrics : fontmetrics option;
encoding : encoding}
(** The fourteen standard fonts: four variants each of Times, Helvetica and
    Courier, plus Symbol and ZapfDingbats. See {!string_of_standard_font} and
    {!standard_font_of_name} for conversion to and from names. *)
type standard_font =
| TimesRoman
| TimesBold
| TimesItalic
| TimesBoldItalic
| Helvetica
| HelveticaBold
| HelveticaOblique
| HelveticaBoldOblique
| Courier
| CourierBold
| CourierOblique
| CourierBoldOblique
| Symbol
| ZapfDingbats
(** CID system information, as held in the [cid_system_info] field of
    {!composite_CIDfont}: two strings ([registry], [ordering]) and an integer
    [supplement]. Presumably these mirror the entries of a PDF
    [/CIDSystemInfo] dictionary -- TODO confirm. *)
type cid_system_info =
{registry : string;
ordering : string;
supplement : int}
(** A composite CID font, as carried by the [CIDKeyedFont] constructor of
    {!font}.

    - [cid_system_info]: see {!cid_system_info}.
    - [cid_basefont]: a string; presumably the base font name.
    - [cid_fontdescriptor]: a {!fontdescriptor} (mandatory here, unlike the
      optional [fontdescriptor] field of {!simple_font}).
    - [cid_widths]: a list of (int, float) pairs; presumably a CID paired
      with its width -- TODO confirm.
    - [cid_default_width]: an integer; presumably the width used for CIDs
      absent from [cid_widths] -- TODO confirm. *)
type composite_CIDfont =
{cid_system_info : cid_system_info;
cid_basefont : string;
cid_fontdescriptor : fontdescriptor;
cid_widths : (int * float) list;
cid_default_width : int}
(** The encoding of a CID-keyed font (third component of the [CIDKeyedFont]
    constructor of {!font}): either [Predefined] with a string (presumably
    the name of a predefined CMap), or [CMap] with an indirect reference to a
    CMap stream. *)
type cmap_encoding =
| Predefined of string
| CMap of int (* indirect reference to CMap stream *)
(** A font, in one of three forms:

    - [StandardFont (f, enc)]: a {!standard_font} together with an
      {!encoding}.
    - [SimpleFont f]: a {!simple_font}.
    - [CIDKeyedFont (s, cidfont, cmap)]: a string (presumably the base font
      name -- TODO confirm), a {!composite_CIDfont} and a {!cmap_encoding}. *)
type font =
| StandardFont of standard_font * encoding
| SimpleFont of simple_font
| CIDKeyedFont of string * composite_CIDfont * cmap_encoding
(** {2 String representations of fonts } *)
(** [string_of_standard_font f] returns the name of the standard font [f] as
    a string, for example "Times-Bold" for [Pdftext.TimesBold]. *)
val string_of_standard_font : standard_font -> string
(** [standard_font_of_name s] parses a name such as "/Times-Bold" or
    "/TimesNewRoman,Bold" into the corresponding standard font, for example
    [Some Pdftext.TimesBold]. The result is an option; presumably [None]
    means the name was not recognised. *)
val standard_font_of_name : string -> standard_font option
(** [string_of_font f] returns a debug string describing the whole font
    value [f]. *)
val string_of_font : font -> string
(** {2 Reading a Font} *)
(** [read_font pdf obj] reads a font from the document [pdf] and the font
    object [obj], returning it as a {!font}. *)
val read_font : Pdf.t -> Pdf.pdfobject -> font
(** {2 Writing a Font} *)
(** [write_font ?objnum pdf f] writes the font [f] to the document [pdf] and
    returns the object number of the main font dictionary.

    NOTE(review): [objnum] is not described in this interface; presumably it
    requests a specific object number for the font dictionary -- confirm
    against the implementation. *)
val write_font : ?objnum:int -> Pdf.t -> font -> int
(** {2 Utility functions} *)
(** [is_unicode s] tells whether the PDF string [s] is UTF16BE, i.e. whether
    it has a byte order marker at the beginning. *)
val is_unicode : string -> bool
(** [is_identity_h f] tells whether the font [f] is Identity H (presumably:
    whether it uses the predefined Identity-H CMap -- TODO confirm). *)
val is_identity_h : font -> bool
(** [codepoints_of_utf8 s] returns the list of unicode codepoints encoded by
    the UTF8 string [s]. *)
val codepoints_of_utf8 : string -> int list
(** [utf8_of_codepoints cs] returns the UTF8 string encoding the list of
    unicode codepoints [cs]. *)
val utf8_of_codepoints : int list -> string
(** [codepoints_of_utf16be s] returns the list of unicode codepoints encoded
    by the UTF16BE string [s]. *)
val codepoints_of_utf16be : string -> int list
(** [utf16be_of_codepoints cs] returns the UTF16BE string encoding the list
    of unicode codepoints [cs]. The result includes a byte order marker. *)
val utf16be_of_codepoints : int list -> string
(** {2 Text from strings outside page content} *)
(** [utf8_of_pdfdocstring s] takes a PDF string [s], which is either in
    PDFDocEncoding or UTF16BE, and returns a UTF8 string representing the
    same unicode codepoints. *)
val utf8_of_pdfdocstring : string -> string
(** [pdfdocstring_of_utf8 s] converts the UTF8 string [s] to a PDF string:
    PDFDocEncoding if no unicode-only characters are used, UTF16BE if they
    are. *)
val pdfdocstring_of_utf8 : string -> string
(** [pdfdocstring_of_codepoints cs] builds a PDF string from the unicode
    codepoints [cs]: PDFDocEncoding if no unicode-only characters are used,
    UTF16BE if they are. *)
val pdfdocstring_of_codepoints : int list -> string
(** [codepoints_of_pdfdocstring s] returns the list of unicode codepoints
    represented by the PDF document string [s], which is either in
    PDFDocEncoding or UTF16BE. *)
val codepoints_of_pdfdocstring : string -> int list
(** [simplify_utf16be s] remakes the UTF16BE string [s] into a
    PDFDocEncoding string if all of its characters are in PDFDocEncoding.
    Presumably [s] is returned unchanged otherwise -- TODO confirm against
    the implementation. *)
val simplify_utf16be : string -> string
(** {2 Text from strings inside page content} *)
(** The abstract type of text extractors. Values are built with
    {!text_extractor_of_font} or {!text_extractor_of_font_real} and consumed
    by {!codepoints_of_text} and {!glyphnames_of_text}. *)
type text_extractor
(** [text_extractor_of_font pdf obj] builds a text extractor from the
    document [pdf] and the font object [obj]. *)
val text_extractor_of_font : Pdf.t -> Pdf.pdfobject -> text_extractor
(** [text_extractor_of_font_real f] builds a text extractor from the
    already-parsed font [f]. Unlike {!text_extractor_of_font}, the signature
    takes no document argument. *)
val text_extractor_of_font_real : font -> text_extractor
(** [codepoints_of_text extractor s] returns the list of unicode codepoints
    for the string [s] under [extractor]. [s] is, for example, the string
    operand of a [Pdfpages.Op_Tj] or [Op_TJ] operator. *)
val codepoints_of_text : text_extractor -> string -> int list
(** [glyphnames_of_text extractor s] returns the list of glyph names for the
    string [s] under [extractor]. *)
val glyphnames_of_text : text_extractor -> string -> string list
(** {2 Building text for strings inside page content} *)
(** [charcode_extractor_of_font ?debug pdf obj] builds, from the document
    [pdf] and the font object [obj], a function from a unicode codepoint to
    its character code. The function returns [Some code] if the codepoint
    exists in the encoding and font object. If [debug] is set (default
    false), missing characters are reported to stderr. *)
val charcode_extractor_of_font : ?debug:bool -> Pdf.t -> Pdf.pdfobject -> (int -> int option)
(** [charcode_extractor_of_font_real ?debug f] builds, from the
    already-parsed font [f], a function from a unicode codepoint to its
    character code. The function returns [Some code] if the codepoint exists
    in the encoding and font. If [debug] is set (default false), missing
    characters are reported to stderr. *)
val charcode_extractor_of_font_real : ?debug:bool -> font -> (int -> int option)
(** [table_of_encoding enc] returns a hash table of all the entries in the
    encoding [enc], keyed by [int] with [string] values (presumably character
    code to glyph name -- TODO confirm). *)
val table_of_encoding : encoding -> (int, string) Hashtbl.t
(** [reverse_table_of_encoding enc] returns the reverse table of all the
    entries in the encoding [enc], keyed by [string] with [int] values
    (presumably glyph name to character code -- TODO confirm). *)
val reverse_table_of_encoding : encoding -> (string, int) Hashtbl.t
(** [parse_tounicode pdf obj] parses the [/ToUnicode] entry [obj] in the
    document [pdf] and returns a list of (int, string) pairs. Presumably each
    pair maps a character code to its unicode text; the encoding of the
    string is not stated here -- TODO confirm against the implementation. *)
val parse_tounicode : Pdf.t -> Pdf.pdfobject -> (int * string) list