package stk_xml

  1. Overview
  2. Docs

Source file xml.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
(*********************************************************************************)
(*                OCaml-Stk                                                      *)
(*                                                                               *)
(*    Copyright (C) 2023-2024 INRIA All rights reserved.                         *)
(*    Author: Maxence Guesdon, INRIA Saclay                                      *)
(*                                                                               *)
(*    This program is free software; you can redistribute it and/or modify       *)
(*    it under the terms of the GNU General Public License as                    *)
(*    published by the Free Software Foundation, version 3 of the License.       *)
(*                                                                               *)
(*    This program is distributed in the hope that it will be useful,            *)
(*    but WITHOUT ANY WARRANTY; without even the implied warranty of             *)
(*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the               *)
(*    GNU General Public License for more details.                               *)
(*                                                                               *)
(*    You should have received a copy of the GNU General Public                  *)
(*    License along with this program; if not, write to the Free Software        *)
(*    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA                   *)
(*    02111-1307  USA                                                            *)
(*                                                                               *)
(*    As a special exception, you have permission to link this program           *)
(*    with the OCaml compiler and distribute executables, as long as you         *)
(*    follow the requirements of the GNU GPL in regard to all of the             *)
(*    software in the executable aside from the OCaml compiler.                  *)
(*                                                                               *)
(*    Contact: Maxence.Guesdon@inria.fr                                          *)
(*                                                                               *)
(*********************************************************************************)

(** XML documents *)


module Log = (val (Log.create_src "xml"))
open Log

let empty_iri = Iri.of_string ""

module QName =
  struct
    type t = Iri.t * string
    let compare (iri1, s1) (iri2, s2) =
      match String.compare s1 s2 with
        0 -> Iri.compare iri1 iri2
      | n -> n

    let to_string (ns, ln) = Printf.sprintf "[%s]%s" (Iri.to_string ns) ln
    let pp ppf x = Format.pp_print_string ppf (to_string x)
  end

let qname_equal ?(no_ns=false) (ns1,n1) (ns2,n2) =
  match String.compare n1 n2 with
  | 0 -> no_ns || Iri.equal ns1 ns2
  | _ -> false

module Attributes = Map.Make(QName)

module P =
  struct
    module Attributes = Attributes
    type attr_value = string Types.with_loc_option
    type data = unit
    let compare_name = QName.compare
    let compare_attr_value (s1,_) (s2,_) = String.compare s1 s2
    let compare_data _ _ = 0
    let default_data () = ()
    let version_name =
      let iri = Iri.of_string "" in
      fun () -> (iri, "version")
    let default_version =  Xtmpl.Xml.P.default_version
    let default_attr_value () = "", None
    let pp_name ppf (iri, str) = Format.fprintf ppf "%a%s" Iri.pp iri str
    let pp_attr_value = Xtmpl.Xml.P.pp_attr_value
    let pp_attributes = None
  end
include (Xtmpl.Types.Make(P))

module TXml =
  struct
    type t = doc
    let compare = compare_doc
    let wrapper = None
    let transition = None
  end
module PXml = Stk.Props.Add_prop_type(TXml)
let mk_prop_xml = PXml.mk_prop ~default:(doc_empty())

let opt_att node ?(try_no_ns=true) x =
  match Attributes.find_opt x node.atts with
  | None when try_no_ns -> Attributes.find_opt (empty_iri, snd x) node.atts
  | x -> x

let get_att node ?try_no_ns iri def =
  match opt_att node ?try_no_ns iri with
  | None -> def
  | Some v -> v

let map_att node ?try_no_ns iri f def =
  match opt_att node ?try_no_ns iri with
  | None -> def
  | Some (x,_) -> try f x with _ -> def

let map_opt_att node ?try_no_ns iri f =
  match opt_att node ?try_no_ns iri with
  | None -> None
  | Some (x,_) -> try Some (f x) with _ -> None

let int_att node ?try_no_ns iri def =
  map_att node ?try_no_ns iri int_of_string def
let opt_int_att node ?try_no_ns iri =
  map_opt_att node ?try_no_ns iri int_of_string

let xhtml_ns_str = "http://www.w3.org/1999/xhtml"
let xhtml_ns = Iri.of_string xhtml_ns_str
let xhtml_ str = (xhtml_ns, str)
let xhtml_id = xhtml_ "id"
let xhtml_name = xhtml_ "name"
let xhtml_href = xhtml_ "href"
let xhtml_class = xhtml_ "class"
let xhtml_a = xhtml_ "a"

let node_id node =
  match opt_att node ~try_no_ns:true xhtml_id with
  | Some (a, _) -> Some a
  | None ->
      match opt_att node ~try_no_ns:true xhtml_name with
      | Some (a, _) -> Some a
      | None -> None

let mime_html = Ldp.Ct.to_mime (Ldp.Types.content_type_of_string "text/html")
let html_mime_types = [ Ldp.Ct.mime_xhtml ; mime_html ]

let mime_is_xml m =
  m = mime_html || m = Ldp.Ct.mime_xhtml
  || (match snd m with "xml" -> true | _ -> false)
  ||
    (let s = Ldp.Ct.mime_to_string m in
     let len = String.length s in
     len >= 4 && String.sub s (len - 4) 4 = "+xml")

module Of_xtmpl =
  struct
    module X = Xtmpl.Xml

    let map_name base att ns = function
    | ("xmlns",_)
    | ("","xmlns") -> (Iri.of_string "xmlns", "")
    | ("",ln) -> if att then (Iri.of_string "", ln) else (base, ln)
    | (n,ln) ->
        try
          let iri = Stk.Smap.find n ns in
          (iri, ln)
        with
          Not_found ->
            warn (fun m -> m "Unknown namespace %S, using base %S"
               n (Iri.to_string base)) ;
            (base, ln)

    let set_ns base ns attr =
      X.Name_map.fold
        (fun (n,ln) (str,_) (base, ns) ->
           match n, ln with
         | ("","xmlns") -> (Iri.of_string str, ns)
           | ("xmlns", ln) ->
               (base, Stk.Smap.add ln (Iri.of_string str) ns)
           | _ -> (base,ns)
        )
        attr (base, ns)

    let map_atts base ns (atts:X.attributes) =
      X.Name_map.fold
        (fun name (str,loc) acc ->
           let str = (str, loc) in
           Attributes.add (map_name base true ns name) str acc)
        atts Attributes.empty

    let map_proc_inst base ns (pi:X.proc_inst) =
      pi_ ?loc:pi.loc (map_name base false ns pi.app) pi.args

    let rec map_tree base ns = function
    | X.D c -> cdata ?loc:c.loc ~quoted:c.quoted c.text
    | X.C c -> comment ?loc:c.loc c.comment
    | X.PI p -> PI (map_proc_inst base ns p)
    | X.E n ->
        let (base, ns) = set_ns base ns n.atts in
        let loc = n.loc in
        let name = map_name base false ns n.name in
        let atts = map_atts base ns n.atts in
        let subs = map_trees base ns n.subs in
        node ?loc name ~atts subs

    and map_trees base ns (xmls:X.tree list) =
      List.map (map_tree base ns) xmls

    let map_prolog_misc base ns = function
    | X.PC c -> PC c
    | PPI pi -> PPI (map_proc_inst base ns pi)

    let map_prolog base ns (p:X.prolog) =
      let decl =
        match p.X.decl with
        | None -> None
        | Some d ->
            let atts = map_atts base ns d.atts in
            Some(xml_decl ?loc:d.loc atts)
      in
      let doctype = match p.X.doctype with
        | None -> None
        | Some d -> Some (doctype ?loc:d.loc (map_name base false ns d.name) d.args)
      in
      let misc = List.map (map_prolog_misc base ns) p.X.misc in
      prolog ?decl ?doctype misc

    let map_doc ?(base=xhtml_ns) (d:X.doc) =
      let prolog = map_prolog base Stk.Smap.empty d.prolog in
      let elements = map_trees base Stk.Smap.empty d.elements in
      doc prolog elements
  end

let doc_from_string ?base ?param str =
  Of_xtmpl.map_doc ?base (Xtmpl.Xml.doc_from_string ?param str)
let from_string ?(base=Iri.of_string "") ?param ?(ns=Stk.Smap.empty) str =
  Of_xtmpl.map_trees base ns (Xtmpl.Xml.from_string ?param str)


let split_text =
  let rec iter s len acc start pos =
    if pos >= len then
      let acc =
        if start < pos then
          (String.sub s start (pos-start)) :: acc
        else
          acc
      in
      List.rev acc
    else
      match String.get s pos with
      | ' '|'\t'|'\r' as c ->
          let acc =
            if start < pos then
              (String.sub s start (pos-start)) :: acc
            else
              acc
          in
          let acc = String.make 1 c :: acc in
          iter s len acc (pos+1) (pos+1)
      | '\n' ->
          let acc =
            if start < pos then
              (String.sub s start (pos-start)) :: acc
            else
              acc
          in
          let acc = "\n" :: acc in
          iter s len acc (pos+1) (pos+1)
      | c ->
          if start < pos then
            iter s len acc start (pos+1)
          else
            iter s len acc pos (pos+1)
  in
  fun text->
    let len = String.length text in
    iter text len [] 0 0

let normalize_xmls =
  let rec iter = function
  | E node ->
      let subs = List.flatten (List.map iter node.subs) in
      [ E { node with subs } ]
  | (PI _ | C _) as x -> [ x ]
  | D { text ; loc }  ->
      let str = Stk.Utf8.normalize text in
      let chunks = split_text str in
      let build_cdata (pos, acc) text =
        let loc, next_pos =
          match pos, text with
          | None, _ -> None, None
          | Some p, _ ->
             let p2 = { p with Lexing.pos_cnum = p.Lexing.pos_cnum + String.length text } in
             let loc = (p, p2) in
             let next_pos =
                match text with
                | "\n" ->
                    { p2 with Lexing.pos_lnum = p2.pos_lnum + 1;
                      pos_bol = p2.pos_bol + p2.pos_cnum ;
                      pos_cnum = 0 ;
                    }
                | _ -> p2
              in
              Some loc, Some next_pos
        in
        let acc = D { text ; loc ; quoted = false } :: acc in
        (next_pos, acc)
      in
      let pos = match loc with None -> None | Some (p,_) -> Some p in
      let (_, l) = List.fold_left build_cdata (pos, []) chunks in
      List.rev l
  in
  fun xmls -> List.flatten (List.map iter xmls)

let xhtml_base xmls base =
  let find str xmls =
    let pred = function
    | E { name } -> QName.compare name (xhtml_ str) = 0
    | _ -> false
    in
    List.find_opt pred xmls
  in
  match find "html" xmls with
  | Some (E { subs }) ->
      (
       match find "head" subs with
       | Some (E { subs }) ->
           (
            match find "base" subs with
            | None -> base
            | Some (E node) ->
                (
                 match opt_att node (Iri.of_string "", "href") with
                 | None -> base
                 | Some (str,loc) ->
                     try
                    let iri = Iri.of_string str in
                       Iri.resolve ~base iri
                     with
                       e ->
                         Log.err (fun m -> m "%a: %s" Types.pp_loc_option loc (Printexc.to_string e));
                         base
                )
            | _ -> base
           )
       | _ -> base
      )
  | _ -> base

let html_self_closing_elements =
  Xtmpl.Xml.SSet.of_list
    [ "area"; "base"; "br" ; "col" ; "embed"; "hr" ;
      "img"; "input"; "link" ; "meta" ; "param" ; "source" ;
      "track" ; "wbr" ;
      (* < html 5: *)
      "command" ; "keygen" ; "menuitem" ;
    ]