package xmldiff

  1. Overview
  2. Docs
Legend:
Page
Library
Module
Module type
Parameter
Class
Class type
Source

Source file xmldiff.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
(*********************************************************************************)
(*                Xmldiff                                                        *)
(*                                                                               *)
(*    Copyright (C) 2014-2021 Institut National de Recherche en Informatique     *)
(*    et en Automatique. All rights reserved.                                    *)
(*                                                                               *)
(*    This program is free software; you can redistribute it and/or modify       *)
(*    it under the terms of the GNU Lesser General Public License version        *)
(*    3 as published by the Free Software Foundation.                            *)
(*                                                                               *)
(*    This program is distributed in the hope that it will be useful,            *)
(*    but WITHOUT ANY WARRANTY; without even the implied warranty of             *)
(*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *)
(*    GNU Library General Public License for more details.                       *)
(*                                                                               *)
(*    You should have received a copy of the GNU Lesser General Public           *)
(*    License along with this program; if not, write to the Free Software        *)
(*    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA                   *)
(*    02111-1307  USA                                                            *)
(*                                                                               *)
(*    Contact: Maxence.Guesdon@inria.fr                                          *)
(*                                                                               *)
(*                                                                               *)
(*********************************************************************************)

(** *)

let dbg_mode =
  match try Sys.getenv "XMLDIFF_DEBUG" with _ -> "" with
    "1" -> true
  | _ -> false

let dbg = if dbg_mode then prerr_endline else fun _ -> ()
let on_dbg f x = if dbg_mode then f x else ()

module Smap = Map.Make(String)
module Intmap = Map.Make
  (struct type t = int let compare (x:int) y = Stdlib.compare x y end)
module Intset = Set.Make
  (struct type t = int let compare (x:int) y = Stdlib.compare x y end)

module Nmap =
 Map.Make (
   struct
     type t = string * string
     let compare (p1, s1) (p2, s2) =
       match String.compare p1 p2 with
         0 -> String.compare s1 s2
       | n -> n
   end)

type name = Xmlm.name

let string_of_name = function
  "", s -> s
| s1, s2 -> s1 ^ ":" ^ s2

let atts_of_list l=
  List.fold_left
    (fun acc (name, v) -> Nmap.add name v acc)
      Nmap.empty l

type 'a xmlt = [
    `E of name * string Nmap.t * 'a list
  | `D of string ]
type xmltree = xmltree xmlt
type xmlnode = int option * xmlnode xmlt

type label = Node of string | Text of string
let compare_label l1 l2 =
  match l1, l2 with
    Node s1, Node s2 -> String.compare s1 s2
  | Text s1, Text s2 -> String.compare s1 s2
  | Node _, Text _  -> -1
  | Text _, Node _ -> 1

module Lmap = Map.Make(struct type t = label let compare = compare_label end)

type node = {
  number : int ;
  children : int array ;
  mutable parent : int option ;
  xml : xmltree ;
  weight : float ;
  hash : string ;
  label : label ;
  rank : int ;
  is_cut : bool ;
  mutable matched : int option ;
  }

type doc = {
  height: int ;
  w0 : float ;
  nodes : node array ;
 }

(*c==v=[File.file_of_string]=1.1====*)
let file_of_string ~file s =
  let oc = open_out file in
  output_string oc s;
  close_out oc
(*/c==v=[File.file_of_string]=1.1====*)

let string_of_atts map =
  let l =
    Nmap.fold
      (fun name s acc ->
         (Printf.sprintf "%s=%S" (string_of_name name) s) :: acc)
      map []
  in
  String.concat " " l

let label_of_xml = function
| `D s -> Text s
| `E (tag, _, _) -> Node (string_of_name tag)


let atts_of_map map =
  List.rev
    (Nmap.fold
     (fun name s acc -> (name, s) :: acc)
       map [])

let string_of_xml ?(cut=false) tree =
  let tree =
    if cut then
      match tree with
        `D _ -> tree
      | `E (name,atts,_) -> `E (name,atts,[])
    else
      tree
  in
  let b = Buffer.create 256 in
  let ns_prefix s = Some s in
  let output = Xmlm.make_output ~ns_prefix ~decl: false (`Buffer b) in
  let frag = function
  | `E (tag, atts, childs) ->
      let atts = atts_of_map atts in
      `El ((tag, atts), childs)
  | `D d -> `Data d
  in
  Xmlm.output_doc_tree frag output (None, tree);
  Buffer.contents b

let hash xml =
  let s =
    match xml with
      `D s -> "!" ^ s
    | `E _ -> "<" ^ (Marshal.to_string xml [])
  in
  Digest.string s


let short_label = function
  `E (("",s2), _, _) -> "<"^s2^">"
| `E ((s1,s2), _, _) -> "<"^s1^":"^s2^">"
| `D s ->
    let len = String.length s in
    let s = Printf.sprintf "%S" (String.sub s 0 (min 10 len)) in
    let len = String.length s in
    String.sub s 1 (len - 2)

let xmlnode_of_t t =
  let rec unfold_cut = function
    (`D s) as xml -> (None, xml)
  | `E (tag, atts, subs) ->
      (None, `E (tag, atts, List.map unfold_cut subs))
  in
  let len = Array.length t in
  let rec build n =
    let xml = t.(n).xml in
    match xml with
      `D s -> (Some n, `D s)
    | `E (tag,atts,children) ->
        let children =
          if t.(n).is_cut then
            List.map unfold_cut children
          else
            List.map build (Array.to_list t.(n).children)
        in
        (Some n, `E (tag, atts, children))
  in
  build (len-1)

let weight xml children =
  match xml with
    `D s -> 1. +. log (float (1 + String.length s))
  | `E _ -> List.fold_left (fun acc c -> c.weight +. acc) 1. children

let t_of_xml =
  let rec iter ?cut (n0, acc, acc_children, h) xml =
    let (label, subs, is_cut) =
      match xml with
      | `D _ -> (label_of_xml xml, [], false)
      | `E (tag, atts, l) ->
          match cut with
          | Some f when f tag atts l -> (Node (string_of_xml xml), [], true)
          | _ -> (label_of_xml xml, l, false)
    in
    let (n, acc, children, h_children) = List.fold_left (iter ?cut) (n0, acc, [], 0) subs in
    let children = List.rev children in
    List.iter (fun node -> node.parent <- Some n) children ;
    let hash = hash xml in
    let weight = weight xml children in
    let node =
      { number = n ;
        children = Array.of_list (List.map (fun node -> node.number) children) ;
        parent = None ;
        xml ; label ; hash ; weight ;
        rank = List.length acc_children ;
        is_cut ;
        matched = None ;
      }
    in
    (n+1, node :: acc, node :: acc_children, max h (h_children + 1))
  in
  fun ?cut xml ->
    let (_, l, _, h) = iter ?cut (0, [], [], 0) xml in
    let t = Array.of_list l in
    Array.sort (fun n1 n2 -> n1.number - n2.number) t;
    Array.iteri (fun i node ->
       (*
       prerr_endline (Printf.sprintf "i=%d, node.number=%d, parent=%s, xml=%s"
        i node.number
          (match node.parent with None -> "" | Some n -> string_of_int n)
          (short_label node.xml)
       );
       *)
       assert (i = node.number)
    ) t;
    { height = h;
      nodes =  t;
      w0 = t.(Array.length t - 1).weight ;
    }

type operation =
  | Replace of node * int
  | Move of int * int * int * int (* node * parent in t1 * new parent * rank *)
  | MoveRank of int * int * int (* node * parent * rank *)
  | Insert of node * int * int (* Insert(node,i,rank) insert tree from t2 as nth child of i *)
  | Delete of node (* delete tree from t1 *)
  | Edit of node * node (* change label of node from t1 to label of node from t2 *)

type actions = operation list

type patch_path =
  Path_cdata of int
| Path_node of Xmlm.name * int * patch_path option

type position = [ `FirstChild | `After]
type patch_operation =
| PInsert of xmltree * position
| PDelete
| PUpdateCData of string
| PUpdateNode of Xmlm.name * string Nmap.t
| PReplace of xmltree
| PMove of patch_path * position

type patch = (patch_path * patch_operation) list

let rec xml_of_source s_source source =
 try
    let ns s = Some s in
    let input = Xmlm.make_input ~ns ~enc: (Some `UTF_8) source in
    let el (tag, atts) childs =
      let atts = List.fold_left
        (fun map (name, v) -> Nmap.add name v map) Nmap.empty atts
      in
      `E (tag, atts, childs)
    in
    let data d = `D d in
    let (_, tree) = Xmlm.input_doc_tree ~el ~data input in
    tree
  with
    Xmlm.Error ((line, col), error) ->
      let msg = Printf.sprintf "%sLine %d, column %d: %s"
        s_source line col (Xmlm.error_message error)
      in
      failwith msg
  | Invalid_argument e ->
      let msg = Printf.sprintf "%sInvalid_argumen(%s)" s_source e in
      failwith msg

and xml_of_string s =
  xml_of_source s (`String (0, s))

let xml_of_file file =
  let ic = open_in file in
  try
    let xml = xml_of_source
      (Printf.sprintf "File %S, " file) (`Channel ic)
    in
    close_in ic;
    xml
  with
    e ->
      close_in ic;
      raise e

let dot_of_t t =
  let b = Buffer.create 256 in
  let p b = Printf.bprintf b in
  p b "digraph g {\nrankdir=TB;\nordering=out;\n";
  Array.iter
    (fun node ->
       p b "\"N%d\" [ label=\"%d: %s[%d]\", fontcolor=black ];\n"
         node.number node.number (short_label node.xml) node.rank;
       Array.iter (fun i -> p b "\"N%d\" -> \"N%d\";\n" node.number i) node.children ;
    )
    t.nodes;
  p b "}\n";
  Buffer.contents b

let dot_of_xmlnode t =
  let b = Buffer.create 256 in
  let p b = Printf.bprintf b in
  p b "digraph g {\nrankdir=TB;\nordering=out;\n";
  let string_of_id = function None -> "_" | Some n -> string_of_int n in
  let cpt = ref 0 in
  let rec iter parent rank (id, xml) =
    incr cpt;
    let n = !cpt in
    p b "\"N%d\" [ label=\"%s: %s[%d]\", fontcolor=black ];\n"
      n  (string_of_id id) (short_label xml) rank;
    (
     match xml with
       `D _ -> ()
     | `E (_,_,subs) ->
         List.iteri (fun i child ->
           iter (Some n) i child ;
         )
           subs
    );
    match parent with
      None -> ()
    | Some parent ->  p b "\"N%d\" -> \"N%d\";\n" parent n
  in
  iter None 0 t ;
  p b "}\n";
  Buffer.contents b

let dot_of_matches t1 t2 =
  let b = Buffer.create 256 in
  let p b = Printf.bprintf b in
  p b "digraph g {\nrankdir=TB;\nordering=out;\n";
  p b "subgraph cluster_2 {\n";
  Array.iter
    (fun node ->
       p b "\"T%d\" [ label=\"%d: %s[%d]\", fontcolor=black ];\n"
         node.number node.number (short_label node.xml) node.rank;
       Array.iter (fun i -> p b "\"T%d\" -> \"T%d\";\n" node.number i) node.children ;
    )
    t2.nodes;
  p b "}\n";
  p b "subgraph cluster_1 {\n";
  Array.iter
    (fun node ->
       p b "\"S%d\" [ label=\"%d: %s[%d]\", fontcolor=black ];\n"
         node.number node.number (short_label node.xml) node.rank;
       Array.iter (fun i -> p b "\"S%d\" -> \"S%d\";\n" node.number i) node.children ;
       match node.matched with
         None -> ()
       | Some j -> p b "S%d -> T%d [style=\"dashed\"];\n" node.number j
    )
    t1.nodes;
  p b "}\n";
  p b "}\n";
  Buffer.contents b

let string_of_action = function
| Replace (n2, i) -> Printf.sprintf "Replace (%d, %d): %s" n2.number i (string_of_xml ~cut:true n2.xml)
| Move (i, parent, new_parent, rank) -> Printf.sprintf "Move(%d,%d,%d,%d)" i parent new_parent rank
| MoveRank (i, new_parent, rank) -> Printf.sprintf "MoveRank(%d,%d,%d)" i new_parent rank
| Insert (n2, i, rank) -> Printf.sprintf "Insert (%d, %d, %d): %s" n2.number i rank (string_of_xml ~cut:true n2.xml)
| Delete n1 -> Printf.sprintf "Delete(%d): %s" n1.number (string_of_xml ~cut: true n1.xml)
| Edit (n1, n2) -> Printf.sprintf "Edit(%d,%d): %s -> %s" n1.number n2.number
  (string_of_xml ~cut: true n1.xml) (string_of_xml ~cut: true n2.xml)

let have_matching_parents nodes1 n1 n2 =
  match n1.parent, n2.parent with
  | None, None -> true
  | None, _
  | _, None -> false
  | Some p1, Some p2 -> nodes1.(p1).matched = Some p2

let matching_parent nodes n =
  match n.parent with
    None -> None
  | Some p -> nodes.(p).matched

let add_edit_action acc n1 n2 =
  match n1.xml, n2.xml with
    `E _, `D _
  | `D _, `E _ -> assert false
  | `D s1, `D s2 -> Edit (n1, n2) :: acc
  | `E (tag1, atts1, _), `E (tag2, atts2, _) ->
      match n1.is_cut, n2.is_cut with
        true, _
      | _, true -> Replace (n2, n1.number) :: acc
      | false, false ->
          begin
            if tag1 = tag2 && Nmap.equal (=) atts1 atts2 then
              acc
            else
              Edit (n1, n2) :: acc
          end

(* FIXME: remove rank accumulator *)
let make_actions t1 t2 =
  let nodes1 = t1.nodes in
  let nodes2 = t2.nodes in
  let rec f (acc, rank) i =
    let n1 = nodes1.(i) in
    match n1.matched with
      None ->
        let (acc, _) = Array.fold_left f (acc, 0) n1.children in
        ((Delete n1) :: acc, rank + 1)
    | Some j ->
        let n2 = nodes2.(j) in
        let matching_parents = have_matching_parents nodes1 n1 n2 in
        let (deleted, acc) =
          if matching_parents then
            if n1.rank = n2.rank then
              (false, acc)
            else
              (
               let new_parent =
                 match n1.parent with None -> assert false | Some i -> i
               in
               (false, MoveRank(n1.number, new_parent, n2.rank) :: acc)
              )
          else
            (
             match matching_parent nodes2 n2 with
               None ->
                 dbg (Printf.sprintf
                  "make_actions: missing matching parent, j=%d, parent=%d"
                    j
                    (match nodes2.(j).parent with None -> -1 | Some n -> n)
                 );
                 (*assert false*)
                 (true, (Delete n1 :: acc))
             | Some new_parent ->
                 let parent =
                   match n2.parent with None -> assert false | Some i -> i
                 in
                 (false, (Move(n1.number, parent, new_parent, n2.rank)) :: acc)
            )
        in
        let acc =
          if deleted then
            acc
          else
            if n1.hash = n2.hash then
              acc
            else
              (
               let acc = add_edit_action acc n1 n2 in
               let (acc, _) = Array.fold_left f (acc, 0) n1.children in
               acc
              )
        in
        (acc, rank + 1)
  in
  let (actions, _) = f ([], 0) (Array.length nodes1 - 1) in
  (* note: we should not have a node not matched in t2 with matched descendants *)
  let rec g acc j =
    let n2 = nodes2.(j) in
    match n2.matched with
      None ->
        dbg (Printf.sprintf "Insert %d which is not matched "n2.number);
        let new_parent =
          match matching_parent nodes2 n2 with
            None ->
              dbg (Printf.sprintf "no matching parent for t2.(%d)" n2.number);
              assert false
          | Some i -> i
        in
        (Insert (n2, new_parent, n2.rank)) :: acc
    | Some _ ->
        Array.fold_left g acc n2.children
  in
  let actions = g actions (Array.length nodes2 - 1) in
  actions

let sort_actions =
  let pred a1 a2 =
    match a1, a2 with
    | Delete i1, Delete i2 -> i2.number - i1.number (* higher node first *)
    | Delete _, _ -> 1
    | _, Delete _ -> -1
    | Edit _, Edit _ -> 0
    | Edit _, _ -> -1
    | _, Edit _ -> 1
    | Replace _, Replace _ -> 0
    | Replace _, _ -> -1
    | _, Replace _ -> 1
    | MoveRank(_,_,rank1), Move(_,_,_,rank2)
    | Move(_,_,_,rank1), MoveRank(_,_,rank2)
    | Move (_,_,_,rank1), Move (_,_,_,rank2)
    | MoveRank(_,_,rank1), MoveRank(_,_,rank2)
    | MoveRank(_,_,rank1), Insert(_,_,rank2)
    | Insert(_,_,rank1), MoveRank(_,_,rank2)
    | Move (_,_,_,rank1), Insert (_,_,rank2)
    | Insert (_,_,rank1), Move (_,_,_,rank2)
    | Insert (_,_,rank1), Insert (_,_,rank2) ->
        rank1 - rank2
  in
  List.sort pred

let build_hash_map =
  let add node map =
    let l =
      try Smap.find node.hash map
      with Not_found -> []
    in
    Smap.add node.hash (node.number :: l) map

  in
  fun t -> Array.fold_right add t.nodes Smap.empty

let rec get_nth_parent t i level =
  match t.nodes.(i).parent with
    None -> None
  | Some p ->
      if level <= 1 then
        Some p
      else
        get_nth_parent t p (level-1)

let d_of_node t i =
  1. +. (float t.height) *. t.nodes.(i).weight /. t.w0

let rec match_nodes ?(fail=true) ?(with_subs=false) t1 t2 i j =
  dbg (Printf.sprintf "matching %d -> %d [with_subs=%B]" i j with_subs);
  let node1 = t1.nodes.(i) in
  match node1.matched with
  | Some j2 when j <> j2 ->
      dbg (Printf.sprintf "t1.(%d) already matched to t2.(%d)" i j2);
      if fail then assert false
  | _ ->
      let node2 = t2.nodes.(j) in
      match node2.matched with
        Some i2 when i <> i2 ->
          dbg (Printf.sprintf "t2.(%d) already matched to t1.(%d)" j i2);
          if fail then assert false
      | _ ->
          node1.matched <- Some j;
          node2.matched <- Some i;
          if with_subs then
            begin
              let ch_i = node1.children in
              let ch_j = node2.children in
              for x = 0 to Array.length ch_i - 1 do
                match_nodes ~with_subs: true t1 t2 ch_i.(x) ch_j.(x)
              done
    end

let match_ancestors t1 t2 i j =
  let max_level = int_of_float (d_of_node t2 j) in
  dbg (Printf.sprintf "match_ancestors: i=%d, j=%d, max_level=%d" i j max_level);
  let rec iter i j level =
    if level > max_level then
      ()
    else
      match t1.nodes.(i).parent, t2.nodes.(j).parent with
        Some p1, Some p2
          when t1.nodes.(p1).label = t2.nodes.(p2).label ->
          match_nodes ~fail: false t1 t2 p1 p2;
          iter p1 p2 (level + 1)
      | _ -> ()
  in
  iter i j 1

let min_list p v l =
  let rec iter acc = function
    [] -> acc
  | h :: q -> if p h < acc then iter h q else iter acc q
  in
  iter v l

let best_candidate t1 t2 j cands =
  dbg ("best_candidates "^(String.concat ", " (List.map string_of_int cands)));
  let d = d_of_node t2 j in
  let map_parent acc = function
    (i, None) -> acc
  | (i, Some p) -> (i, t1.nodes.(p).parent) :: acc
  in
  let rec find level parent_j acc = function
    [] -> iter (level + 1) acc
  | (i, None) :: q -> find level parent_j acc q
  | (i, Some p) :: q ->
      match t1.nodes.(p).matched with
        Some j when j = parent_j ->
          Some i
      | _ ->
          find level parent_j ((i, Some p)::acc) q
  and iter level cands =
    if float level < d then
      begin
        let cands = List.rev (List.fold_left map_parent [] cands) in
        match get_nth_parent t2 j level with
          None -> None
        | Some parent_j ->
            find level parent_j [] cands
      end
    else
      None
  in
  iter 1 (List.map (fun i -> (i, Some i)) cands)

let candidates hash_t1 t2 j =
  try Smap.find t2.nodes.(j).hash hash_t1 with Not_found -> []

let match_candidate hash_t1 t1 t2 j =
  let candidates = candidates hash_t1 t2 j in
  let pred i = t1.nodes.(i).matched = None in
  match List.filter pred candidates with
    [] -> None
  | [i] -> Some i
  | l -> best_candidate t1 t2 j l

let (+=) map (k, v) =
  let x = try Intmap.find k map with Not_found -> 0. in
  Intmap.add k (v +. x) map

let match_uniquely_labeled =
  let map_of_t nodes t f map =
    Array.fold_left
      (fun map i ->
         let node = nodes.(i) in
         match node.matched with
           Some _ -> map
         | None ->
             let label = node.label in
             let x = try Lmap.find label map with Not_found -> ([], []) in
             let x = f i x in
             Lmap.add label x map
      )
      map t
  in
  fun t1 t2 li lj ->
    let map = map_of_t t1.nodes li (fun i (l1,l2) -> (i :: l1, l2)) Lmap.empty in
    let map = map_of_t t2.nodes lj (fun j (l1,l2) -> (l1, j :: l2)) map in
    Lmap.iter
      (fun _ -> function
         | [ i ], [ j ] -> match_nodes t1 t2 i j
         | _ -> ())
      map

let match_uniquely_labeled_children =
  let do_match t1 t2 j =
    let nj = t2.nodes.(j) in
    match nj.matched with
      None -> ()
    | Some i ->
        let children_i = t1.nodes.(i).children in
        let children_j = nj.children in
        match_uniquely_labeled t1 t2 children_i children_j
  in
  fun t1 t2 ->
    for j = Array.length t2.nodes -1 downto 0 do
      do_match t1 t2 j
    done

let run_phase4 t1 t2 =
  let f j node =
    match node.matched with
      Some _ -> ()
    | None ->
        let parents = Array.fold_left
          (fun acc jc ->
             match t2.nodes.(jc).matched with
               None -> acc
             | Some i ->
                 dbg (Printf.sprintf "%d has a child %d matched to %d" j jc i);
                 match t1.nodes.(i).parent with
                 | Some p when t1.nodes.(p).matched = None ->
                     dbg (Printf.sprintf "%d has a non-matched parent %d" i p);
                     acc += (p, t1.nodes.(i).weight)
                 | Some p ->
                     dbg (Printf.sprintf "[j=%d] i=%d has a parent %d already matched" j i p);
                     acc
                 | None -> acc
          )
            Intmap.empty node.children
        in
        let (parent, _) = Intmap.fold
          (fun p w ((acc_parent, acc_w) as acc) ->
             if w > acc_w then (p, w) else acc)
            parents (-1, -1.0)
        in
        if parent >= 0 then match_nodes t1 t2 parent j
  in
  Array.iteri f t2.nodes;
  match_uniquely_labeled_children t1 t2

let order_by_weight n1 n2 =
  match Stdlib.compare n2.weight n1.weight with
    0 -> Stdlib.compare n1.rank n2.rank
  | n -> n

let compute t1 t2 =
  let weight_queue = Queue.create () in
  let root2 = Array.length t2.nodes - 1 in
  let hash_t1 = build_hash_map t1 in

  (* make roots match orelse we'll have problems *)
  match_nodes t1 t2 (Array.length t1.nodes - 1) root2 ;
  let queue_nodes children =
    let t = Array.map (Array.get t2.nodes) children in
    Array.sort order_by_weight t;
    Array.iteri (fun _ n ->
       (*prerr_endline (Printf.sprintf "queuing %d" n.number);*)
       Queue.add n.number weight_queue)
      t
  in
  queue_nodes t2.nodes.(root2).children ;

  while not (Queue.is_empty weight_queue) do
    let j = Queue.pop weight_queue in
    (*prerr_endline (Printf.sprintf "trying to match %d" j);*)
    match t2.nodes.(j).matched with
      Some _ -> ()
    | None ->
        match match_candidate hash_t1 t1 t2 j with
          Some i ->
            match_nodes ~with_subs: true t1 t2 i j;
            match_ancestors t1 t2 i j
        | None ->
            queue_nodes t2.nodes.(j).children
  done;

  run_phase4 t1 t2 ;
  on_dbg (fun () -> file_of_string ~file:"/tmp/matches.dot" (dot_of_matches t1 t2)) ();
  sort_actions (make_actions t1 t2)

type cur_path = N of Xmlm.name | CData
module Cur_path = Map.Make (struct type t = cur_path let compare = Stdlib.compare end)
let cur_path_get cp map =
  try Cur_path.find cp map
  with Not_found -> 0

let cur_path_inc cp map =
  let n = cur_path_get cp map in
  Cur_path.add cp (n+1) map

let patch_path_of_cur_path_list =
  let iter (cp, n) acc =
    match acc, cp with
      (None, CData) -> Some (Path_cdata n)
    | (Some _, CData) -> assert false
    | (_, N name) -> Some (Path_node (name, n, acc))
  in
  fun l ->
    match List.fold_right iter l None with
      None -> assert false
    | Some p -> p


let rec string_of_path = function
  Path_cdata n -> "CData("^(string_of_int n)^")"
| Path_node (name, n, next) ->
    let s = (string_of_name name)^"("^(string_of_int n)^")" in
    match next with
      None -> s
    | Some p -> s^"/"^(string_of_path p)

let path_of_id =
  let cp_of_xml = function
    `D s -> CData
  | `E (name,_,_) -> N name
  in
  let rec forward to_move xmlnode path cur_path ~skip n  = function
  | (Some i, _) :: q when i = skip ->
      forward to_move xmlnode path cur_path ~skip n q
  | [] ->
      let b = Buffer.create 256 in
      Buffer.add_string b (string_of_path (patch_path_of_cur_path_list (List.rev path)));
      Buffer.add_char b '{';
      Cur_path.iter (fun k n -> Printf.bprintf b "%s->%d,"
         (match k with N name -> string_of_name name | CData -> "cdata") n) cur_path ;
      Buffer.add_char b '}';
      on_dbg (fun () -> file_of_string ~file: "/tmp/xmldiff_state.dot" (dot_of_xmlnode xmlnode)) ();
      failwith (Printf.sprintf "Invalid rank: %d element missing\npath: %s" (n+1) (Buffer.contents b))

  | (Some id, xml) :: q when Intset.mem id to_move ->
      let cur_path =
        let cp = cp_of_xml xml in
        cur_path_inc cp cur_path
      in
      forward to_move xmlnode path cur_path ~skip n q

  | (_, xml) :: _ when n = 0 ->
      let cp = cp_of_xml xml in
      (cp, cur_path_get cp cur_path) :: path

  | (id, xml) :: q ->
      let cur_path =
        let cp = cp_of_xml xml in
        cur_path_inc cp cur_path
      in
      (*dbg ("forward: id="^(match id with None -> "None" | Some n -> string_of_int n));*)
      forward to_move xmlnode path cur_path ~skip (n-1) q
  in

  let rec iter to_move xmlnode ~rank ~skip i path cur_path = function
  | (Some j, xml) when i = j ->
      begin
        let cp = cp_of_xml xml in
        let path = (cp, cur_path_get cp cur_path) :: path in
        let (path, pos) =
          match rank with
            None -> (path, `After)
          | Some 0 -> (path, `FirstChild)
          | Some n ->
              match xml with
                `D _ -> assert false
              | `E (_,_,subs) -> (forward to_move xmlnode path Cur_path.empty ~skip (n-1) subs, `After)
        in
        (patch_path_of_cur_path_list (List.rev path), pos)
      end
(*  | (Some j, _) when j < i -> raise Not_found*)
  | (_, `D _) -> raise Not_found
  | (_, `E (name, atts, subs)) ->
      (* let's go down after adding cur_path to path *)
      let cpt = cur_path_get (N name) cur_path in
      let path = (N name, cpt) :: path in
      iter_list to_move xmlnode ~rank ~skip i path Cur_path.empty subs

  and iter_list to_move xmlnode ~rank ~skip i path cur_path = function
    [] -> raise Not_found
  | h :: q ->
    try iter to_move xmlnode ~rank ~skip i path cur_path h
    with Not_found ->
      let cur_path =
        let cp = cp_of_xml (snd h) in
        cur_path_inc cp cur_path
      in
      iter_list to_move xmlnode ~rank ~skip i path cur_path q
  in
  (* the skip parameter is used to not take into account the node
     we are moving under its parent. *)
  fun to_move xmlnode ?rank ?(skip=(-1)) i ->
    try iter to_move xmlnode ~rank ~skip i [] Cur_path.empty xmlnode
    with Not_found ->
      let msg = "Id "^(string_of_int i)^" not found" in
      failwith msg

let rec xmlnode_of_xmltree = function
  `D s -> (None, `D s)
| `E (name,atts,subs) ->
    (None, `E (name,atts, List.map xmlnode_of_xmltree subs))

let string_of_position = function
  `FirstChild -> "FirstChild"
| `After -> "After"

let string_of_patch_operation (path, op) =
  match op with
  | PReplace xmltree ->
      "REPLACE("^(string_of_path path)^", "^(string_of_xml ~cut:true xmltree)^")"
  | PInsert (xmltree, pos) ->
      "INSERT("^(string_of_path path)^", "^(string_of_xml ~cut: true xmltree)^", "^(string_of_position pos)^")"
  | PDelete ->
      "DELETE("^(string_of_path path)^")"
  | PUpdateCData s ->
      Printf.sprintf "UPDATE_CDATA(%s, %S)" (string_of_path path) s
  | PUpdateNode (name, atts) ->
      Printf.sprintf "UPDATE_NODE(%s, %S, _)" (string_of_path path) (string_of_name name)
  | PMove (newpath, pos) ->
      Printf.sprintf "MOVE(%s, %s, %s)" (string_of_path path) (string_of_path newpath) (string_of_position pos)

let string_of_patch l =
  String.concat "\n" (List.map string_of_patch_operation l)

let remove_xmlnode t path =
  let rec iter xmls path =
    match xmls, path with
      ((x, `D _) as xml):: q, Path_cdata 0 -> (xml, q)
    | (x, `D s) :: q, Path_cdata n ->
       let (removed, xmls) = iter q (Path_cdata (n-1)) in
       (removed, (x, `D s) :: xmls)
    | ((x, `E (name,atts,subs) as xml) :: q, Path_node (name2, n, next)) when name = name2 ->
        if n = 0 then
          (match next with
             None -> (xml, q)
           | Some p ->
               let (removed, xmls) = iter subs p in
               (removed, [x, `E (name, atts, xmls)] @ q)
          )
        else
          (
           let (removed, xmls) = iter q (Path_node (name2, n-1, next)) in
           (removed, xml :: xmls)
          )
    | xml :: q, p ->
       let (removed, xmls) = iter q p in
       (removed, xml :: xmls)
    | [], _ -> assert false
  in
  match iter [t] path with
    removed, [t] -> (removed, t)
  | _ -> assert false

let insert_xmlnode t node path pos =
  let rec iter xmls path =
    match xmls, path with
      ((x, `D _) as xml):: q, Path_cdata 0 ->
        begin
          match pos with
            `FirstChild -> assert false
          | `After -> xml :: node :: q
        end
    | (x, `D s) :: q, Path_cdata n ->
        (x, `D s) :: iter q (Path_cdata (n-1))
    | ((x, `E (name,atts,subs) as xml) :: q, Path_node (name2, n, next)) when name = name2 ->
        if n = 0 then
          (match next with
             None ->
               begin
                 match pos with
                   `FirstChild -> (x, `E(name,atts,node::subs)) :: q
                 | `After -> xml :: node :: q
               end
           | Some p -> [x, `E (name, atts, iter subs p)] @ q
          )
        else
          xml :: iter q (Path_node (name2, n-1, next))
    | xml :: q, p ->
        xml :: iter q p
    | [], _ -> assert false
  in
  match iter [t] path with
    [t] -> t
  | _ -> assert false

let patch_xmlnode t path op =
  try
    match op with
      PMove (newpath, pos) ->
        dbg (string_of_patch_operation (path, op));
        let removed, t = remove_xmlnode t path in
        dbg ("node removed");
        insert_xmlnode t removed newpath pos
    | _ ->
        let apply xml op =
          match xml, op with
          | _, PReplace tree -> [xmlnode_of_xmltree tree]
          | _, PInsert (tree, `After) -> [ xml ; xmlnode_of_xmltree tree ]
          | (_, `D _), PInsert (_, `FirstChild) -> assert false
          | (x, `E (tag,atts,subs)), PInsert (tree, `FirstChild) ->
              [ (x, `E (tag, atts, xmlnode_of_xmltree tree :: subs)) ]
          | _, PDelete -> []
          | (x, _), PUpdateCData s -> [(x, `D s)]
          | (x, `D _), PUpdateNode (name, atts) -> [x, `E (name,atts,[])]
          | (x, `E (_,_,subs)), PUpdateNode (name, atts) -> [x, `E (name,atts,subs)]
          | (_,_), PMove (_, _) -> assert false
        in
        let rec iter xmls path =
          match xmls, path with
            ((x, `D _) as xml):: q, Path_cdata 0 -> (apply xml op) @ q
          | (x, `D s) :: q, Path_cdata n ->
              (x, `D s) :: iter q (Path_cdata (n-1))
          | ((x, `E (name,atts,subs) as xml) :: q, Path_node (name2, n, next)) when name = name2 ->
              if n = 0 then
                (match next with
                   None -> (apply xml op) @ q
                 | Some p -> [x, `E (name, atts, iter subs p)] @ q
                )
              else
                xml :: iter q (Path_node (name2, n-1, next))
          | xml :: q, p ->
              xml :: iter q p
          | [], _ -> assert false
        in
        match iter [t] path with
          [t] -> t
        | _ -> assert false
  with
    Failure msg ->
      failwith ("Error: "^msg^" when applying path:\n"^
       (string_of_patch_operation (path, op)))

let patch_of_action (t1, to_move, patch) action =
  dbg ("patch_of_action: "^string_of_action action);
  dbg ("to_move = "^(String.concat ", " (List.map string_of_int (Intset.elements to_move))));
  match action with
  | Replace (n2, i) ->
      let xmltree2 = n2.xml in
      let (path, _) = path_of_id to_move t1 i in
      let op = PReplace xmltree2 in
      let t1 = patch_xmlnode t1 path op in
      (t1, to_move, (path, op) :: patch)
  | Move (i, parent, new_parent, rank) ->
      let (path, _) = path_of_id to_move t1 i in
      let (new_path, pos) = path_of_id to_move t1 ~rank ~skip: i new_parent in
      let op = PMove (new_path, pos) in
      let t1 = patch_xmlnode t1 path op in
      let to_move = Intset.remove i to_move in
      (t1, to_move, (path, op) :: patch)
  | MoveRank (i, parent, rank) ->
      let (path, _) = path_of_id to_move t1 i in
      let (new_path, pos) = path_of_id to_move t1 ~rank ~skip: i parent in
      let op = PMove (new_path, pos) in
      let t1_patched = patch_xmlnode t1 path op in
      let to_move = Intset.remove i to_move in
      if t1_patched = t1 then
        (t1, to_move, patch)
      else
        (t1_patched, to_move, (path, op) :: patch)
  | Insert (n2, i, rank) ->
      let xmltree2 = n2.xml in
      let (path, pos) = path_of_id to_move t1 ~rank i in
      let op = PInsert (xmltree2, pos) in
      let t1 = patch_xmlnode t1 path op in
      (t1, to_move, (path, op) :: patch)
  | Delete i ->
      begin
        match path_of_id to_move t1 i.number with
        | exception _ ->
           (* the node may have already been deleted when its parent was,
              see sort_actions: we delete first higher nodes, to prevent
              deleting all children then parent *)
            let to_move = Intset.remove i.number to_move in
            (t1, to_move, patch)
        | (path,_) ->
            let op = PDelete in
            let t1 = patch_xmlnode t1 path op in
            let to_move = Intset.remove i.number to_move in
            (t1, to_move, (path, op) :: patch)
      end
  | Edit (n1, n2) ->
      let (path,_) = path_of_id to_move t1 n1.number in
      let op =
        match n1.xml, n2.xml with
          _ , `D s2 -> PUpdateCData s2
        | `E (_,_,_), `E (name,atts,_) -> PUpdateNode (name, atts)
        | `D _, `E (name,atts,subs) -> PUpdateNode (name, atts)
      in
      let t1 = patch_xmlnode t1 path op in
      (t1, to_move, (path, op) :: patch)

let rec xmltree_of_xmlnode = function
  (_, `D s) -> `D s
| (_, `E (tag,atts,subs)) -> `E (tag, atts, List.map xmltree_of_xmlnode subs)

let nodes_to_move =
  let add acc = function
    Move (i,_,_,_)
  | MoveRank (i,_,_) -> Intset.add i acc
  | Delete i -> Intset.add i.number acc
  | _ -> acc
  in
  List.fold_left add Intset.empty

let patch_of_actions t1 t2 actions =
  let to_move = nodes_to_move actions in
  let nodes1 = xmlnode_of_t t1.nodes in
  on_dbg (fun () -> file_of_string ~file: "/tmp/before_patch.dot" (dot_of_xmlnode nodes1)) ();
  let (nodes1, to_move, l) = List.fold_left patch_of_action (nodes1, to_move, []) actions in
  assert (Intset.is_empty to_move);
  on_dbg (fun () -> file_of_string ~file: "/tmp/patch_result.dot" (dot_of_xmlnode nodes1)) ();

  let t1 = xmltree_of_xmlnode nodes1 in
  on_dbg (fun () ->
     let t2 = xmltree_of_xmlnode (xmlnode_of_t t2.nodes) in
     file_of_string ~file: "/tmp/xml1.xml" (string_of_xml t1) ;
     file_of_string ~file: "/tmp/xml2.xml" (string_of_xml t2) ;
  ) ();
  (List.rev l, t1)

let diff_with_final_tree ?cut xml1 xml2 =
  let t1 = t_of_xml ?cut xml1 in
  let t2 = t_of_xml ?cut xml2 in

  on_dbg (fun () ->
     file_of_string ~file: "/tmp/t1.dot" (dot_of_t t1);
     file_of_string ~file: "/tmp/t2.dot" (dot_of_t t2);
  ) ();

  let actions = compute t1 t2 in
  dbg ("actions=\n  "^(String.concat "\n  " (List.map string_of_action actions)));
  patch_of_actions t1 t2 actions

let diff ?cut xml1 xml2 = fst (diff_with_final_tree ?cut xml1 xml2)


OCaml

Innovation. Community. Security.