Legend:
Page
Library
Module
Module type
Parameter
Class
Class type
Source
Page
Library
Module
Module type
Parameter
Class
Class type
Source
plain_diff.ml1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357(* This files comes from camlp5 (ocaml_src/lib/diff.ml). *) (* * Copyright (c) 2007-2013, INRIA (Institut National de Recherches en * Informatique et Automatique). All rights reserved. * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of INRIA, nor the names of its contributors may be * used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY INRIA AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INRIA AND * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. *) (* $Id: diff.ml,v 1.2 2013-02-26 08:15:06 deraugla Exp $ *) (* Parts of Code of GNU diff (diffseq.h and analyze.c) translated to OCaml and adjusted. Basic algorithm described by Eugene W.Myers in: "An O(ND) Difference Algorithm and Its Variations" *) open Base (* A partition is the midpoint of the shortest edit script for a specified portion of two vectors. [xmid, ymid] is the midpoint discovered. The diagonal number [xmid - ymid] equals the number of inserted elements minus the number of deleted elements (counting only elements before the midpoint). [lo_minimal] is true iff the minimal edit script for the left half of the partition is known; similarly for [hi_minimal]. *) module Partition = struct type t = { xmid : int ; ymid : int ; lo_minimal : bool ; hi_minimal : bool } end (* We keep this file in a C-like style so that we can easily compare against the original C, in which we have great confidence. *) (* Find the midpoint of the shortest edit script for a specified portion of the two vectors. Scan from the beginnings of the vectors, and simultaneously from the ends, doing a breadth-first search through the space of edit-sequence. When the two searches meet, we have found the midpoint of the shortest edit sequence. If [find_minimal] is true, find the minimal edit script regardless of expense. Otherwise, if the search is too expensive, use heuristics to stop the search and report a suboptimal answer. This function assumes that the first elements of the specified portions of the two vectors do not match, and likewise that the last elements do not match. The caller must trim matching elements from the beginning and end of the portions it is going to specify. If we return the "wrong" partitions, the worst this can do is cause suboptimal diff output. It cannot cause incorrect diff output. *) let diag ~fd ~bd ~sh ~xv ~yv ~xoff ~xlim ~yoff ~ylim ~too_expensive ~find_minimal : Partition.t = let dmin = xoff - ylim (* minimum valid diagonal *) in let dmax = xlim - yoff (* maximum valid diagonal *) in let fmid = xoff - yoff (* center diagonal of forward search *) in let bmid = xlim - ylim (* center diagonal of backward search *) in (* southeast corner is on an odd diagonal w.r.t the northwest *) let odd = (fmid - bmid) land 1 <> 0 in (* [sh] is an offset that lets us use indices in [[-(m+1), n+1]]. *) fd.(sh + fmid) <- xoff; bd.(sh + bmid) <- xlim; With_return.with_return (fun ({ return } : Partition.t With_return.return) -> (* [c] is cost. [fmin], [fmax] are limits of the forward search. [bmin], [bmax] are limits of the backward search. *) let rec loop ~c ~fmin ~fmax ~bmin ~bmax = (* Extend the forward search by one edit step in each diagonal. *) let fmin = if fmin > dmin then ( fd.(sh + fmin - 2) <- -1; fmin - 1) else fmin + 1 in let fmax = if fmax < dmax then ( fd.(sh + fmax + 2) <- -1; fmax + 1) else fmax - 1 in (* [d] is the active diagonal. *) (let rec loop d = if d < fmin then () else ( let tlo = fd.(sh + d - 1) in let thi = fd.(sh + d + 1) in let x = if tlo >= thi then tlo + 1 else thi in let x, y = let rec loop ~xv ~yv ~xlim ~ylim ~x ~y = if x < xlim && y < ylim && phys_equal (xv x) (yv y) then loop ~xv ~yv ~xlim ~ylim ~x:(x + 1) ~y:(y + 1) else x, y in loop ~xv ~yv ~xlim ~ylim ~x ~y:(x - d) in fd.(sh + d) <- x; if odd && bmin <= d && d <= bmax && bd.(sh + d) <= fd.(sh + d) then return { xmid = x; ymid = y; lo_minimal = true; hi_minimal = true } else loop (d - 2)) in loop fmax); (* Similarly extend the backward search. *) let bmin = if bmin > dmin then ( bd.(sh + bmin - 2) <- Int.max_value; bmin - 1) else bmin + 1 in let bmax = if bmax < dmax then ( bd.(sh + bmax + 2) <- Int.max_value; bmax + 1) else bmax - 1 in (let rec loop d = if d < bmin then () else ( let tlo = bd.(sh + d - 1) in let thi = bd.(sh + d + 1) in let x = if tlo < thi then tlo else thi - 1 in let x, y = let rec loop ~xv ~yv ~xoff ~yoff ~x ~y = if x > xoff && y > yoff && phys_equal (xv (x - 1)) (yv (y - 1)) then loop ~xv ~yv ~xoff ~yoff ~x:(x - 1) ~y:(y - 1) else x, y in loop ~xv ~yv ~xoff ~yoff ~x ~y:(x - d) in bd.(sh + d) <- x; if (not odd) && fmin <= d && d <= fmax && bd.(sh + d) <= fd.(sh + d) then return { xmid = x; ymid = y; lo_minimal = true; hi_minimal = true } else loop (d - 2)) in loop bmax); (* Heuristic: if we've gone well beyond the call of duty, give up and report halfway between our best results so far. *) if (not find_minimal) && c >= too_expensive then ( (* Find forward diagonal that maximizes [x + y]. *) let fxybest, fxbest = let rec loop ~d ~fxybest ~fxbest = if d < fmin then fxybest, fxbest else ( let x = Int.min fd.(sh + d) xlim in let y = x - d in let x, y = if ylim < y then ylim + d, ylim else x, y in let fxybest, fxbest = if fxybest < x + y then x + y, x else fxybest, fxbest in loop ~d:(d - 2) ~fxybest ~fxbest) in loop ~d:fmax ~fxybest:(-1) ~fxbest:fmax in (* Find backward diagonal that minimizes [x + y]. *) let bxybest, bxbest = let rec loop ~d ~bxybest ~bxbest = if d < bmin then bxybest, bxbest else ( let x = Int.max xoff bd.(sh + d) in let y = x - d in let x, y = if y < yoff then yoff + d, yoff else x, y in let bxybest, bxbest = if x + y < bxybest then x + y, x else bxybest, bxbest in loop ~d:(d - 2) ~bxybest ~bxbest) in loop ~d:bmax ~bxybest:Int.max_value ~bxbest:bmax in if xlim + ylim - bxybest < fxybest - (xoff + yoff) then return { xmid = fxbest ; ymid = fxybest - fxbest ; lo_minimal = true ; hi_minimal = false } else return { xmid = bxbest ; ymid = bxybest - bxbest ; lo_minimal = false ; hi_minimal = true }) else loop ~c:(c + 1) ~fmin ~fmax ~bmin ~bmax in loop ~c:1 ~fmin:fmid ~fmax:fmid ~bmin:bmid ~bmax:bmid) ;; let diff_loop ~cutoff a ai b bi n m = let fd = Array.create ~len:(n + m + 3) 0 in let bd = Array.create ~len:(n + m + 3) 0 in let sh = m + 1 in let too_expensive = match cutoff with | Some c -> c | None -> let diags = n + m + 3 in let rec loop diags too_expensive = if diags = 0 then too_expensive else loop (diags asr 2) (too_expensive lsl 1) in Int.max 4096 (loop diags 1) in let xvec i = a.(ai.(i)) in let yvec j = b.(bi.(j)) in let chng1 = Array.create ~len:(Array.length a) true in let chng2 = Array.create ~len:(Array.length b) true in for i = 0 to n - 1 do chng1.(ai.(i)) <- false done; for j = 0 to m - 1 do chng2.(bi.(j)) <- false done; let rec loop ~xoff ~xlim ~yoff ~ylim ~find_minimal = let xoff, yoff = let rec loop ~xoff ~yoff = if xoff < xlim && yoff < ylim && phys_equal (xvec xoff) (yvec yoff) then loop ~xoff:(xoff + 1) ~yoff:(yoff + 1) else xoff, yoff in loop ~xoff ~yoff in let xlim, ylim = let rec loop ~xlim ~ylim = if xlim > xoff && ylim > yoff && phys_equal (xvec (xlim - 1)) (yvec (ylim - 1)) then loop ~xlim:(xlim - 1) ~ylim:(ylim - 1) else xlim, ylim in loop ~xlim ~ylim in if xoff = xlim then for y = yoff to ylim - 1 do chng2.(bi.(y)) <- true done else if yoff = ylim then for x = xoff to xlim - 1 do chng1.(ai.(x)) <- true done else ( let { Partition.xmid; ymid; lo_minimal; hi_minimal } = diag ~fd ~bd ~sh ~xv:xvec ~yv:yvec ~xoff ~xlim ~yoff ~ylim ~too_expensive ~find_minimal in loop ~xoff ~xlim:xmid ~yoff ~ylim:ymid ~find_minimal:lo_minimal; loop ~xoff:xmid ~xlim ~yoff:ymid ~ylim ~find_minimal:hi_minimal) in loop ~xoff:0 ~xlim:n ~yoff:0 ~ylim:m ~find_minimal:false; chng1, chng2 ;; (* [make_indexer a b] returns an array of the indices of items of [a] which are also present in [b]; this way, the main algorithm can skip items which, anyway, are different. This improves the speed much. At the same time, this function updates the items of [a] and [b] so that all equal items point to the same unique item. All item comparisons in the main algorithm can therefore be done with [phys_equal] instead of [=], which can improve speed much. *) let make_indexer hashable a b = let n = Array.length a in let htb = Hashtbl.create hashable ~size:(10 * Array.length b) in Array.iteri ~f:(fun i e -> match Hashtbl.find htb e with | Some v -> b.(i) <- v | None -> Hashtbl.add_exn htb ~key:e ~data:e) b; let ai = Array.create ~len:n 0 in let k = let rec loop i k = if i = n then k else ( let k = match Hashtbl.find htb a.(i) with | Some v -> a.(i) <- v; ai.(k) <- i; k + 1 | None -> k in loop (i + 1) k) in loop 0 0 in Array.sub ai ~pos:0 ~len:k ;; let f ~cutoff ~hashable a b = let ai = make_indexer hashable a b in let bi = make_indexer hashable b a in let n = Array.length ai in let m = Array.length bi in diff_loop ~cutoff a ai b bi n m ;; let iter_matches ?cutoff ~f:ff ~hashable a b = let d1, d2 = f ~cutoff ~hashable a b in let rec aux i1 i2 = if i1 >= Array.length d1 || i2 >= Array.length d2 then () else if not d1.(i1) then if not d2.(i2) then ( ff (i1, i2); aux (i1 + 1) (i2 + 1)) else aux i1 (i2 + 1) else if not d2.(i2) then aux (i1 + 1) i2 else aux (i1 + 1) (i2 + 1) in aux 0 0 ;;