Legend:
Page
Library
Module
Module type
Parameter
Class
Class type
Source
Page
Library
Module
Module type
Parameter
Class
Class type
Source
qcow.ml1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593(* * Copyright (C) 2015 David Scott <dave@recoil.org> * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. * *) open Result open Qcow_types module Error = Qcow_error module Header = Qcow_header module Virtual = Qcow_virtual module Physical = Qcow_physical module Locks = Qcow_locks module Cstructs = Qcow_cstructs module Int = Qcow_int module Int64 = Qcow_types.Int64 let ( <| ) = Int64.shift_left let ( |> ) = Int64.shift_right_logical let src = let src = Logs.Src.create "qcow" ~doc:"qcow2-formatted BLOCK device" in Logs.Src.set_level src (Some Logs.Info) ; src module Log = (val Logs.src_log src : Logs.LOG) module DebugSetting = struct let compact_mid_write = ref false end open Prometheus exception Compressed_unsupported module Metrics = struct let namespace = "Mirage" let subsystem = "qcow" let label_name = "id" let reads = let help = "Number of bytes read" in Counter.v_label ~label_name ~help ~namespace ~subsystem "reads" let writes = let help = "Number of bytes written" in Counter.v_label ~label_name ~help ~namespace ~subsystem "writes" let discards = let help = "Number of bytes discarded" in Counter.v_label ~label_name ~help ~namespace ~subsystem "discards" end module Make (Base : Qcow_s.RESIZABLE_BLOCK) = struct (* samoht: `Msg should be the list of all possible exceptions *) type error = [Mirage_block.error | `Msg of string] module Lwt_error = Qcow_error.Lwt_error (* samoht: `Msg should be the list of all possible exceptions *) type write_error = [Mirage_block.write_error | `Msg of string] module Lwt_write_error = Qcow_error.Lwt_write_error let pp_error ppf = function | #Mirage_block.error as e -> Mirage_block.pp_error ppf e | `Msg s -> Fmt.string ppf s let pp_write_error ppf = function | #Mirage_block.write_error as e -> Mirage_block.pp_write_error ppf e | `Msg s -> Fmt.string ppf s module Config = Qcow_config (* Qemu-img will 'allocate' the last cluster by writing only the last sector. Cope with this by assuming all later sectors are full of zeroes *) module B = Qcow_padded.Make (Base) (* Run all threads in parallel, wait for all to complete, then iterate through the results and return the first failure we discover. *) let iter_p f xs = let threads = List.map f xs in Lwt_list.fold_left_s (fun acc t -> match acc with | Error x -> Lwt.return (Error x) (* first error wins *) | Ok () -> t ) (Ok ()) threads module Cache = Qcow_cache module Recycler = Qcow_recycler.Make (B) module Metadata = Qcow_metadata module Stats = struct type t = {mutable nr_erased: int64; mutable nr_unmapped: int64} let zero = {nr_erased= 0L; nr_unmapped= 0L} end type t = { mutable h: Header.t ; base: B.t ; config: Config.t ; info: Mirage_block.info ; cache: Cache.t ; locks: Locks.t ; recycler: Recycler.t ; metadata: Metadata.t ; (* for convenience *) cluster_bits: int ; sector_size: int ; mutable lazy_refcounts: bool (* true if we are omitting refcounts right now *) ; mutable stats: Stats.t ; mutable cluster_map: Qcow_cluster_map.t (* a live map of the allocated storage *) ; cluster_map_m: Lwt_mutex.t } let get_info t = Lwt.return t.info let to_config t = t.config let get_stats t = t.stats let malloc t = let cluster_bits = Int32.to_int t.Header.cluster_bits in let npages = max 1 (1 lsl (cluster_bits - 12)) in let pages = Io_page.(to_cstruct (get npages)) in Cstruct.sub pages 0 (1 lsl cluster_bits) (* Mmarshal a disk physical address written at a given offset within the disk. *) let marshal_physical_address ?client t offset v = let cluster = Physical.cluster ~cluster_bits:t.cluster_bits offset in Metadata.update ?client t.metadata cluster (fun c -> let addresses = Metadata.Physical.of_contents c in let within = Physical.within_cluster ~cluster_bits:t.cluster_bits offset in try Metadata.Physical.set addresses within v ; Lwt.return (Ok ()) with e -> Lwt.fail e ) (* Unmarshal a disk physical address written at a given offset within the disk. *) let unmarshal_physical_address ?client t offset = let cluster = Physical.cluster ~cluster_bits:t.cluster_bits offset in let open Lwt_error.Infix in Metadata.read_and_lock ?client t.metadata cluster >>= fun (c, lock) -> let addresses = Metadata.Physical.of_contents c in let within = Physical.within_cluster ~cluster_bits:t.cluster_bits offset in Lwt.return (Ok (Metadata.Physical.get addresses within, lock)) let adapt_error : B.error -> error = function | #Mirage_block.error as e -> e | _ -> `Msg "Unknown error" let adapt_write_error : B.write_error -> write_error = function | #Mirage_block.write_error as e -> e | _ -> `Msg "Unknown error" let adapt_write_error_result = function | Error e -> Lwt.return_error (adapt_write_error e) | Ok x -> Lwt.return_ok x let update_header t h = let cluster = malloc t.h in match Header.write h cluster with | Result.Ok _ -> ( let open Lwt.Infix in B.write t.base 0L [cluster] >>= function | Error e -> Lwt.return_error (adapt_write_error e) | Ok () -> ( Recycler.flush t.recycler >>= function | Error e -> Lwt.return_error (adapt_write_error e) | Ok () -> Log.debug (fun f -> f "Written header") ; t.h <- h ; Lwt.return (Ok ()) ) ) | Result.Error (`Msg m) -> Lwt.return (Error (`Msg m)) let resize_base base sector_size cluster_map new_size = let sector, within = Physical.to_sector ~sector_size new_size in if within <> 0 then Lwt.return (Error (`Msg (Printf.sprintf "Internal error: attempting to resize to a non-sector multiple \ %s" (Physical.to_string new_size) ) ) ) else let open Lwt.Infix in ( match cluster_map with | Some (cluster_map, cluster_bits) -> let cluster = Physical.cluster ~cluster_bits new_size in Qcow_cluster_map.resize cluster_map cluster | None -> () ) ; B.resize base sector >>= function | Error e -> Lwt.return_error (adapt_write_error e) | Ok () -> Log.debug (fun f -> f "Resized device to %d bytes" (Qcow_physical.to_bytes new_size) ) ; Lwt.return (Ok ()) module ClusterIO = struct (** Allocate [n] clusters and registers them as new roots in the cluster map where [set] is a a set of possibly non-contiguous physical clusters which are guaranteed to contain zeroes. This must be called via Locks.with_metadata_lock, to prevent a parallel thread allocating another cluster for the same purpose. This also prevents the recycling thread from resizing the file concurrently. *) let allocate_clusters t n = let sectors_per_cluster = (1 lsl t.cluster_bits) / t.sector_size in let open Lwt.Infix in B.get_info t.base >>= fun base_info -> let open Lwt_write_error.Infix in (* If there is junk beyond the last block because someone just discarded something then truncate the file to erase it. *) let last_block = Qcow_cluster_map.get_last_block t.cluster_map in let last_file_block = Cluster.of_int (Int64.to_int base_info.Mirage_block.size_sectors / sectors_per_cluster - 1 ) in assert (last_block <= last_file_block) ; let rest_of_file = if last_block = last_file_block then Cluster.IntervalSet.empty else Cluster.IntervalSet.( add (Interval.make (Cluster.succ last_block) last_file_block) empty ) in ( if Cluster.IntervalSet.( not @@ is_empty @@ inter rest_of_file @@ Qcow_cluster_map.Junk.get t.cluster_map ) then ( Log.debug (fun f -> f "Allocator: there is junk after the last block %s, shrinking \ file" (Cluster.to_string last_block) ) ; let size_clusters_should_be = Cluster.to_int last_block + 1 in let p = Physical.make (size_clusters_should_be lsl t.cluster_bits) in let size_sectors = Physical.sector ~sector_size:t.sector_size p in resize_base t.base t.sector_size (Some (t.cluster_map, t.cluster_bits)) p >>= fun () -> Log.debug (fun f -> f "Resized file to %d clusters (%Ld sectors)" size_clusters_should_be size_sectors ) ; Lwt.return (Ok size_sectors) ) else Lwt.return (Ok base_info.Mirage_block.size_sectors) ) >>= fun size_sectors -> let limit = 256 in (* 16 MiB *) let quantum = 512 in (* 32 MiB *) let max_cluster_needed = Cluster.to_int last_block + n in let len_cluster = Int64.to_int size_sectors / sectors_per_cluster in let len_cluster_should_be = if len_cluster - max_cluster_needed < limit then len_cluster + quantum else len_cluster in (* keep it the same *) ( if len_cluster_should_be <> len_cluster then ( Log.info (fun f -> f "Allocator: %s" (Qcow_cluster_map.to_summary_string t.cluster_map) ) ; Log.info (fun f -> f "Allocator: file contains cluster 0 .. %d will enlarge file to \ 0 .. %d" (len_cluster - 1) (len_cluster_should_be - 1) ) ; (* Resync the file size only *) let p = Physical.make (len_cluster_should_be lsl t.cluster_bits) in let size_sectors = Physical.sector ~sector_size:t.sector_size p in resize_base t.base t.sector_size (Some (t.cluster_map, t.cluster_bits)) p >>= fun () -> Log.debug (fun f -> f "Resized file to %d clusters (%Ld sectors)" len_cluster_should_be size_sectors ) ; Lwt.return (Ok ()) ) else Lwt.return (Ok ()) ) >>= fun () -> match Recycler.allocate t.recycler (Cluster.of_int n) with | Some set -> Log.debug (fun f -> f "Allocated %d clusters from free list" n) ; Lwt.return (Ok set) | None -> assert false (* never happens because of the `resize_base` above *) module Refcount = struct (* The refcount table contains pointers to clusters which themselves contain the 2-byte refcounts *) let zero_all ?client t = (* Zero all clusters allocated in the refcount table *) let cluster = Physical.cluster ~cluster_bits:t.cluster_bits t.h.Header.refcount_table_offset in let refcount_table_clusters = Int32.to_int t.h.Header.refcount_table_clusters in let rec loop i = if i >= refcount_table_clusters then Lwt.return (Ok ()) else (* `read` expects the function to be read-only, however we cheat and perform write operations inside the read context *) let open Lwt_error.Infix in Metadata.read ?client t.metadata Cluster.(add cluster (of_int i)) (fun c -> let addresses = Metadata.Physical.of_contents c in let rec loop i = if i >= Metadata.Physical.len addresses then Lwt.return (Ok ()) else let open Lwt_write_error.Infix in let addr = Metadata.Physical.get addresses i in ( if Physical.to_bytes addr <> 0 then let cluster = Physical.cluster ~cluster_bits:t.cluster_bits addr in Metadata.update ?client t.metadata cluster (fun c -> Metadata.erase c ; Lwt.return (Ok ()) ) >>= fun () -> let open Lwt.Infix in Recycler.flush t.recycler >>= adapt_write_error_result else Lwt.return (Ok ()) ) >>= fun () -> loop (i + 1) in let open Lwt.Infix in loop 0 >>= function | Error `Disconnected -> Lwt.return (Error `Disconnected) | Error `Is_read_only -> Lwt.return (Error (`Msg "Device is read only")) | Error (`Msg m) -> Lwt.return (Error (`Msg m)) | Ok () -> Lwt.return (Ok ()) ) >>= fun () -> loop (i + 1) in loop 0 let read ?client t cluster = let cluster = Cluster.to_int64 cluster in let within_table = Int64.(div cluster (Header.refcounts_per_cluster t.h)) in let within_cluster = Int64.(to_int (rem cluster (Header.refcounts_per_cluster t.h))) in let offset = Physical.add t.h.Header.refcount_table_offset (8 * Int64.to_int within_table) in let open Lwt_error.Infix in unmarshal_physical_address ?client t offset >>= fun (offset, lock) -> Lwt.finalize (fun () -> if Physical.to_bytes offset = 0 then Lwt.return (Ok 0) else let cluster = Physical.cluster ~cluster_bits:t.cluster_bits offset in Metadata.read ?client t.metadata cluster (fun c -> let refcounts = Metadata.Refcounts.of_contents c in Lwt.return (Ok (Metadata.Refcounts.get refcounts within_cluster)) ) ) (fun () -> Locks.unlock lock ; Lwt.return_unit) (** Decrement the refcount of a given cluster. This will never need to allocate. We never bother to deallocate refcount clusters which are empty. *) let really_decr ?client t cluster = let cluster = Cluster.to_int64 cluster in let within_table = Int64.(div cluster (Header.refcounts_per_cluster t.h)) in let within_cluster = Int64.(to_int (rem cluster (Header.refcounts_per_cluster t.h))) in let offset = Physical.add t.h.Header.refcount_table_offset (8 * Int64.to_int within_table) in let open Lwt_write_error.Infix in unmarshal_physical_address ?client t offset >>= fun (offset, lock) -> Lwt.finalize (fun () -> if Physical.to_bytes offset = 0 then ( Log.err (fun f -> f "Refcount.decr: cluster %Ld has no refcount cluster \ allocated" cluster ) ; Lwt.return (Error (`Msg (Printf.sprintf "Refcount.decr: cluster %Ld has no refcount cluster \ allocated" cluster ) ) ) ) else let cluster = Physical.cluster ~cluster_bits:t.cluster_bits offset in Metadata.update ?client t.metadata cluster (fun c -> let refcounts = Metadata.Refcounts.of_contents c in let current = Metadata.Refcounts.get refcounts within_cluster in if current = 0 then ( Log.err (fun f -> f "Refcount.decr: cluster %s already has a refcount of \ 0" (Cluster.to_string cluster) ) ; Lwt.return (Error (`Msg (Printf.sprintf "Refcount.decr: cluster %s already has a \ refcount of 0" (Cluster.to_string cluster) ) ) ) ) else ( Metadata.Refcounts.set refcounts within_cluster (current - 1) ; Lwt.return (Ok ()) ) ) ) (fun () -> Locks.unlock lock ; Lwt.return_unit) (** Increment the refcount of a given cluster. Note this might need to allocate itself, to enlarge the refcount table. When this function returns the refcount is guaranteed to have been persisted. *) let rec really_incr ?client t cluster = let open Lwt_write_error.Infix in let cluster = Cluster.to_int64 cluster in let within_table = Int64.(div cluster (Header.refcounts_per_cluster t.h)) in let within_cluster = Int64.(to_int (rem cluster (Header.refcounts_per_cluster t.h))) in (* If the table (containing pointers to clusters which contain the refcounts) is too small, then reallocate it now. *) let cluster_containing_pointer = let within_table_offset = Int64.mul within_table 8L in within_table_offset |> t.cluster_bits in let current_size_clusters = Int64.of_int32 t.h.Header.refcount_table_clusters in ( if cluster_containing_pointer >= current_size_clusters then let needed = Header.max_refcount_table_size t.h in (* Make sure this is actually an increase: make the table 2x larger if not *) let needed = if needed = current_size_clusters then Int64.mul 2L current_size_clusters else needed in allocate_clusters t (Int64.to_int needed) >>= fun free -> Lwt.finalize (fun () -> (* Erasing new blocks is handled after the copy *) (* Copy any existing refcounts into new table *) let buf = malloc t.h in let rec loop free i = if i >= Int32.to_int t.h.Header.refcount_table_clusters then Lwt.return (Ok ()) else let physical = Physical.add t.h.Header.refcount_table_offset (i lsl t.cluster_bits) in let src = Physical.cluster ~cluster_bits:t.cluster_bits physical in let first = Cluster.IntervalSet.(Interval.x (min_elt free)) in let physical = Physical.make (Cluster.to_int first lsl t.cluster_bits) in let dst = Physical.cluster ~cluster_bits:t.cluster_bits physical in let open Lwt.Infix in Recycler.copy t.recycler src dst >>= function | Error e -> Lwt.return_error (adapt_write_error e) | Ok () -> let free = Cluster.IntervalSet.( remove (Interval.make first first) free ) in loop free (i + 1) in loop free 0 >>= fun () -> Log.debug (fun f -> f "Copied refcounts into new table") ; (* Zero new blocks *) Cstruct.memset buf 0 ; let rec loop free i = if i >= needed then Lwt.return (Ok ()) else let first = Cluster.IntervalSet.(Interval.x (min_elt free)) in let physical = Physical.make (Cluster.to_int first lsl t.cluster_bits) in let sector, _ = Physical.to_sector ~sector_size:t.sector_size physical in let open Lwt.Infix in B.write t.base sector [buf] >>= function | Error e -> Lwt.return_error (adapt_write_error e) | Ok () -> let free = Cluster.IntervalSet.( remove (Interval.make first first) free ) in loop free (Int64.succ i) in loop free (Int64.of_int32 t.h.Header.refcount_table_clusters) >>= fun () -> let first = Cluster.IntervalSet.(Interval.x (min_elt free)) in let refcount_table_offset = Physical.make (Cluster.to_int first lsl t.cluster_bits) in let h' = { t.h with Header.refcount_table_offset ; refcount_table_clusters= Int64.to_int32 needed } in update_header t h' >>= fun () -> (* increase the refcount of the clusters we just allocated *) let rec loop free i = if i >= needed then Lwt.return (Ok ()) else let first = Cluster.IntervalSet.(Interval.x (min_elt free)) in really_incr ?client t first >>= fun () -> let free = Cluster.IntervalSet.( remove (Interval.make first first) free ) in loop free (Int64.succ i) in loop free 0L ) (fun () -> Qcow_cluster_map.Roots.remove t.cluster_map free ; Lwt.return_unit ) else Lwt.return (Ok ()) ) >>= fun () -> let offset = Physical.add t.h.Header.refcount_table_offset (8 * Int64.to_int within_table) in unmarshal_physical_address ?client t offset >>= fun (addr, lock) -> Lwt.finalize (fun () -> ( if Physical.to_bytes addr = 0 then allocate_clusters t 1 >>= fun free -> Lwt.finalize (fun () -> let cluster = Cluster.IntervalSet.(Interval.x (min_elt free)) in (* NB: the pointers in the refcount table are different from the pointers in the cluster table: the high order bits are not used to encode extra information and wil confuse qemu/qemu-img. *) let addr = Physical.make (Cluster.to_int cluster lsl t.cluster_bits) in (* zero the cluster *) let buf = malloc t.h in Cstruct.memset buf 0 ; let sector, _ = Physical.to_sector ~sector_size:t.sector_size addr in let open Lwt.Infix in B.write t.base sector [buf] >>= function | Error e -> Lwt.return_error (adapt_write_error e) | Ok () -> ( (* Ensure the new zeroed cluster has been persisted before we reference it via `marshal_physical_address` *) Recycler.flush t.recycler >>= function | Error e -> Lwt.return_error (adapt_write_error e) | Ok () -> ( Log.debug (fun f -> f "Allocated new refcount cluster %s" (Cluster.to_string cluster) ) ; let open Lwt_write_error.Infix in marshal_physical_address ?client t offset addr >>= fun () -> let open Lwt.Infix in Recycler.flush t.recycler >>= function | Error e -> Lwt.return_error (adapt_write_error e) | Ok () -> let open Lwt_write_error.Infix in really_incr ?client t cluster >>= fun () -> Lwt.return (Ok addr) ) ) ) (fun () -> Qcow_cluster_map.Roots.remove t.cluster_map free ; Lwt.return_unit ) else Lwt.return (Ok addr) ) >>= fun offset -> let refcount_cluster = Physical.cluster ~cluster_bits:t.cluster_bits offset in Metadata.update ?client t.metadata refcount_cluster (fun c -> let refcounts = Metadata.Refcounts.of_contents c in let current = Metadata.Refcounts.get refcounts within_cluster in (* We don't support refcounts of more than 1 *) assert (current == 0) ; Metadata.Refcounts.set refcounts within_cluster (current + 1) ; Lwt.return (Ok ()) ) ) (fun () -> Locks.unlock lock ; Lwt.return_unit) >>= fun () -> let open Lwt.Infix in Recycler.flush t.recycler >>= function | Error e -> Lwt.return_error (adapt_write_error e) | Ok () -> Log.debug (fun f -> f "Incremented refcount of cluster %Ld" cluster) ; Lwt.return (Ok ()) (* If the lazy refcounts feature is enabled then don't actually Increment the refcounts. *) let incr ?client t cluster = if t.lazy_refcounts then Lwt.return (Ok ()) else really_incr ?client t cluster let decr ?client t cluster = if t.lazy_refcounts then Lwt.return (Ok ()) else really_decr ?client t cluster end let read_l1_table ?client t l1_index = (* Read l1[l1_index] as a 64-bit offset *) let l1_index_offset = Physical.shift t.h.Header.l1_table_offset (8 * Int64.to_int l1_index) in unmarshal_physical_address ?client t l1_index_offset (* Find the first l1_index whose values satisfies [f] *) let find_mapped_l1_table t l1_index = let open Lwt_error.Infix in (* Read l1[l1_index] as a 64-bit offset *) let rec loop l1_index = if l1_index >= Int64.of_int32 t.h.Header.l1_size then Lwt.return (Ok None) else let l1_index_offset = Physical.shift t.h.Header.l1_table_offset (8 * Int64.to_int l1_index) in let cluster = Physical.cluster ~cluster_bits:t.cluster_bits l1_index_offset in Metadata.read t.metadata cluster (fun c -> let addresses = Metadata.Physical.of_contents c in let within = Physical.within_cluster ~cluster_bits:t.cluster_bits l1_index_offset in let rec loop l1_index i : [`Skip of int | `GotOne of int64] = if i >= Metadata.Physical.len addresses then `Skip i else if Metadata.Physical.get addresses i <> Physical.unmapped then `GotOne l1_index else loop (Int64.succ l1_index) (i + 1) in Lwt.return (Ok (loop l1_index within)) ) >>= function | `GotOne l1_index' -> Lwt.return (Ok (Some l1_index')) | `Skip n -> loop Int64.(add l1_index (of_int n)) in loop l1_index let write_l1_table ?client t l1_index l2_table_offset = let open Lwt_write_error.Infix in (* Always set the mutable flag *) let l2_table_offset = if l2_table_offset = Physical.unmapped then Physical.unmapped (* don't set metadata bits for unmapped clusters *) else Physical.make ~is_mutable:true (Physical.to_bytes l2_table_offset) in (* Write l1[l1_index] as a 64-bit offset *) let l1_index_offset = Physical.shift t.h.Header.l1_table_offset (8 * Int64.to_int l1_index) in marshal_physical_address ?client t l1_index_offset l2_table_offset >>= fun () -> Log.debug (fun f -> f "Written l1_table[%Ld] <- %s" l1_index (Cluster.to_string @@ Physical.cluster ~cluster_bits:t.cluster_bits l2_table_offset ) ) ; Lwt.return (Ok ()) let read_l2_table ?client t l2_table_offset l2_index = let l2_index_offset = Physical.shift l2_table_offset (8 * Int64.to_int l2_index) in unmarshal_physical_address ?client t l2_index_offset let write_l2_table ?client t l2_table_offset l2_index cluster = let open Lwt_write_error.Infix in (* Always set the mutable flag *) let cluster = if cluster = Physical.unmapped then Physical.unmapped (* don't set metadata bits for unmapped clusters *) else Physical.make ~is_mutable:true (Physical.to_bytes cluster) in let l2_index_offset = Physical.shift l2_table_offset (8 * Int64.to_int l2_index) in marshal_physical_address ?client t l2_index_offset cluster >>= fun _ -> Log.debug (fun f -> f "Written l2_table[%Ld] <- %s" l2_index (Cluster.to_string @@ Physical.cluster ~cluster_bits:t.cluster_bits cluster ) ) ; Lwt.return (Ok ()) (* Walk the L1 and L2 tables to translate an address. If a table entry is unallocated then return [None]. Note if a [walk_and_allocate] is racing with us then we may or may not see the mapping. *) let walk_readonly ?client t a = let open Lwt_error.Infix in Locks.with_metadata_lock t.locks (fun () -> read_l1_table ?client t a.Virtual.l1_index >>= fun (l2_table_offset, l1_lock) -> let ( >>|= ) m f = let open Lwt in m >>= function | Error x -> Lwt.return (Error x) | Ok None -> Lwt.return (Ok None) | Ok (Some x) -> f x in (* Look up an L2 table *) ( if Physical.to_bytes l2_table_offset = 0 then ( Locks.unlock l1_lock ; Lwt.return (Ok None) ) else ( if Physical.is_compressed l2_table_offset then raise Compressed_unsupported ; Lwt.return (Ok (Some l2_table_offset)) ) ) >>|= fun l2_table_offset -> (* Look up a cluster *) read_l2_table ?client t l2_table_offset a.Virtual.l2_index >>= fun (cluster_offset, l2_lock) -> ( if Physical.to_bytes cluster_offset = 0 then ( Locks.unlock l1_lock ; Locks.unlock l2_lock ; Lwt.return (Ok None) ) else ( if Physical.is_compressed cluster_offset then raise Compressed_unsupported ; Lwt.return (Ok (Some cluster_offset)) ) ) >>|= fun cluster_offset -> let p = Physical.shift cluster_offset (Int64.to_int a.Virtual.cluster) in Lwt.return (Ok (Some (p, l1_lock, l2_lock))) ) (* Walk the L1 and L2 tables to translate an address, allocating missing entries as we go. *) let walk_and_allocate ?client t a = let open Lwt_write_error.Infix in Locks.with_metadata_lock t.locks (fun () -> read_l1_table ?client t a.Virtual.l1_index >>= fun (l2_offset, l1_lock) -> (* If there is no L2 table entry then allocate L2 and data clusters at the same time to minimise I/O *) ( if Physical.to_bytes l2_offset = 0 then allocate_clusters t 2 >>= fun free -> Lwt.finalize (fun () -> (* FIXME: it's unnecessary to write to the data cluster if we're about to overwrite it with real data straight away *) let open Lwt_write_error.Infix in let l2_cluster = Cluster.IntervalSet.(Interval.x (min_elt free)) in let free = Cluster.IntervalSet.( remove (Interval.make l2_cluster l2_cluster) free ) in let data_cluster = Cluster.IntervalSet.(Interval.x (min_elt free)) in Refcount.incr t l2_cluster >>= fun () -> Refcount.incr t data_cluster >>= fun () -> let l2_offset = Physical.make (Cluster.to_int l2_cluster lsl t.cluster_bits) in let data_offset = Physical.make (Cluster.to_int data_cluster lsl t.cluster_bits) in write_l2_table ?client t l2_offset a.Virtual.l2_index data_offset >>= fun () -> read_l2_table ?client t l2_offset a.Virtual.l2_index >>= fun (data_offset', l2_lock) -> (* NB the new blocks can't be moved within the `allocate_clusters` callback since they are registered as global roots *) assert ( Physical.to_bytes data_offset = Physical.to_bytes data_offset' ) ; write_l1_table ?client t a.Virtual.l1_index l2_offset >>= fun () -> Lwt.return (Ok (data_offset, l1_lock, l2_lock)) ) (fun () -> Qcow_cluster_map.Roots.remove t.cluster_map free ; Lwt.return_unit ) else read_l2_table ?client t l2_offset a.Virtual.l2_index >>= fun (data_offset, l2_lock) -> if Physical.to_bytes data_offset = 0 then allocate_clusters t 1 >>= fun free -> Lwt.finalize (fun () -> let open Lwt_write_error.Infix in let data_cluster = Cluster.IntervalSet.(Interval.x (min_elt free)) in Refcount.incr t data_cluster >>= fun () -> let data_offset = Physical.make (Cluster.to_int data_cluster lsl t.cluster_bits) in let open Lwt_write_error.Infix in write_l2_table ?client t l2_offset a.Virtual.l2_index data_offset >>= fun () -> Lwt.return (Ok (data_offset, l1_lock, l2_lock)) ) (fun () -> Qcow_cluster_map.Roots.remove t.cluster_map free ; Lwt.return_unit ) else ( if Physical.is_compressed data_offset then raise Compressed_unsupported ; Lwt.return (Ok (data_offset, l1_lock, l2_lock)) ) ) >>= fun (data_offset, l1_lock, l2_lock) -> let p = Physical.shift data_offset (Int64.to_int a.Virtual.cluster) in Lwt.return (Ok (p, l1_lock, l2_lock)) ) let walk_and_deallocate ?client t sector n = let open Lwt_write_error.Infix in let sectors_per_cluster = Int64.(div (1L <| t.cluster_bits) (of_int t.sector_size)) in Locks.with_metadata_lock t.locks (fun () -> let get_l2 sector = let byte = Int64.(mul sector (of_int t.info.Mirage_block.sector_size)) in let a = Virtual.make ~cluster_bits:t.cluster_bits byte in read_l1_table ?client t a.Virtual.l1_index >>= fun (l2_offset, l1_lock) -> if Physical.to_bytes l2_offset = 0 then ( Locks.unlock l1_lock ; Lwt.return (Ok None) ) else let l2_index_offset = Physical.shift l2_offset (8 * Int64.to_int a.Virtual.l2_index) in let cluster = Physical.cluster ~cluster_bits:t.cluster_bits l2_index_offset in let within = Physical.within_cluster ~cluster_bits:t.cluster_bits l2_index_offset in Lwt.return (Ok (Some ((cluster, within), l1_lock))) in let rec loop sector n = if n = 0L then Lwt.return (Ok ()) else (get_l2 sector >>= function | None -> (* FIXME: we can almost certainly jump more than this *) Lwt.return (Ok sectors_per_cluster) | Some ((cluster, _), l1_lock) -> Lwt.finalize (fun () -> Metadata.update ?client t.metadata cluster (fun c -> let addresses = Metadata.Physical.of_contents c in (* With the cluster write lock held, complete as many writes to it as we need, unlocking and writing it out once at the end. *) let rec inner acc sector n = if n = 0L then Lwt.return (Ok acc) else get_l2 sector >>= function | None -> Lwt.return (Ok acc) | Some ((cluster', _), l1_lock) when cluster <> cluster' -> Locks.unlock l1_lock ; Lwt.return (Ok acc) | Some ((_, within), l1_lock) -> Locks.unlock l1_lock ; (* still locked above *) let data_offset = Metadata.Physical.get addresses within in if Physical.to_bytes data_offset = 0 then inner (Int64.add acc sectors_per_cluster) (Int64.add sector sectors_per_cluster) (Int64.sub n sectors_per_cluster) else (* The data at [data_offset] is about to become an unreferenced hole in the file *) let current = Metadata.Physical.get addresses within in ( if current <> Physical.unmapped then Locks.Write.with_lock t.locks ?client (Physical.cluster ~cluster_bits:t.cluster_bits current ) (fun () -> (* It's important to hold the write lock because we might be about to erase or copy this block *) Metadata.Physical.set addresses within Physical.unmapped ; t.stats.Stats.nr_unmapped <- Int64.add t.stats.Stats.nr_unmapped sectors_per_cluster ; Lwt.return (Ok ()) ) else Lwt.return (Ok ()) ) >>= fun () -> Refcount.decr t (Physical.cluster ~cluster_bits:t.cluster_bits data_offset ) >>= fun () -> inner (Int64.add acc sectors_per_cluster) (Int64.add sector sectors_per_cluster) (Int64.sub n sectors_per_cluster) in inner 0L sector n ) ) (fun () -> Locks.unlock l1_lock ; Lwt.return_unit) ) >>= fun to_advance -> loop (Int64.add sector to_advance) (Int64.sub n to_advance) in loop sector n ) end (* Starting at byte offset [ofs], map a list of buffers onto a list of [byte offset, buffer] pairs, where - no [byte offset, buffer] pair crosses an [alignment] boundary; - each [buffer] is as large as possible (so for example if we supply one large buffer it will only be fragmented to the minimum extent. *) let rec chop_into_aligned alignment ofs = function | [] -> [] | buf :: bufs -> (* If we're not aligned, sync to the next boundary *) let into = Int64.(to_int (sub alignment (rem ofs alignment))) in if Cstruct.length buf > into then let this = (ofs, Cstruct.sub buf 0 into) in let rest = chop_into_aligned alignment Int64.(add ofs (of_int into)) (Cstruct.shift buf into :: bufs) in this :: rest else (ofs, buf) :: chop_into_aligned alignment Int64.(add ofs (of_int (Cstruct.length buf))) bufs type work = { sector: int64 (* starting sector of the operaiton *) ; bufs: Cstruct.t list ; metadata_locks: Locks.lock list (* read locks on the metadata pointing to the physical clusters: our guarantee that the target physical clusters haven't been moved and the references rewritten *) } (* Given a list of offset, buffer pairs for reading or writing, coalesce adjacent offsets for readv/writev *) let coalesce_into_adjacent sector_size = let rec loop sector bufs metadata_locks next_sector acc = function | [] -> List.rev ({sector; bufs= List.rev bufs; metadata_locks} :: acc) | work :: rest -> let next_sector' = Int64.( add work.sector (of_int (Cstructs.len work.bufs / sector_size)) ) in if next_sector = work.sector then loop sector (work.bufs @ bufs) (work.metadata_locks @ metadata_locks) next_sector' acc rest else loop work.sector work.bufs work.metadata_locks next_sector' ({sector; bufs= List.rev bufs; metadata_locks} :: acc) rest in function | [] -> [] | work :: rest -> let next_sector' = Int64.(add work.sector (of_int (Cstructs.len work.bufs / sector_size))) in loop work.sector work.bufs work.metadata_locks next_sector' [] rest exception Reference_outside_file of int64 * int64 let make_cluster_map t ?id () = let open Qcow_cluster_map in let sectors_per_cluster = Int64.(div (1L <| t.cluster_bits) (of_int t.sector_size)) in let open Lwt.Infix in B.get_info t.base >>= fun base_info -> let max_cluster = Cluster.of_int64 @@ Int64.div base_info.Mirage_block.size_sectors sectors_per_cluster in (* Iterate over the all clusters referenced from all the tables in the file and (a) construct a set of free clusters; and (b) construct a map of physical cluster back to virtual. The free set will show us the holes, and the map will tell us where to get the data from to fill the holes in with. *) let refs = ref Cluster.Map.empty in let refcount_start_cluster = Cluster.to_int64 @@ Physical.cluster ~cluster_bits:t.cluster_bits t.h.Header.refcount_table_offset in let int64s_per_cluster = 1L <| Int32.to_int t.h.Header.cluster_bits - 3 in let l1_table_start_cluster = Cluster.to_int64 @@ Physical.cluster ~cluster_bits:t.cluster_bits t.h.Header.l1_table_offset in let l1_table_clusters = Int64.( div (round_up (of_int32 t.h.Header.l1_size) int64s_per_cluster) int64s_per_cluster ) in (* Assume all clusters are free. Note when the file is sparse we can exceed the max possible cluster. This is only a sanity check to catch crazily-wrong inputs. *) let cluster_size = 1L <| t.cluster_bits in let max_possible_cluster = Cluster.of_int64 (Int64.round_up t.h.Header.size cluster_size |> t.cluster_bits) in let free = Qcow_bitmap.make_full ~initial_size:(Cluster.to_int max_cluster) ~maximum_size:(Cluster.to_int max_possible_cluster * 50) in (* The header structures are untracked by the qcow_cluster_map and we assume they don't move and we don't try to move them. We assume the structures have no holes in them, otherwise we would miscompute the `first_movable_cluster` and accidentally truncate the file. *) Qcow_bitmap.( remove (Interval.make 0L Int64.(pred @@ add l1_table_start_cluster l1_table_clusters) ) free ) ; Qcow_bitmap.( remove (Interval.make 0L Int64.( pred @@ add refcount_start_cluster (Int64.of_int32 t.h.Header.refcount_table_clusters) ) ) free ) ; Qcow_bitmap.(remove (Interval.make 0L 0L) free) ; let first_movable_cluster = try Cluster.of_int64 @@ Qcow_bitmap.min_elt free with Not_found -> max_cluster (* header takes up the whole file *) in let parse x = if x = Physical.unmapped then Cluster.zero else if Physical.is_compressed x then ( Log.err (fun f -> f "Unsupported compressed Cluster Descriptor has been found" ) ; raise Compressed_unsupported ) else Physical.cluster ~cluster_bits:t.cluster_bits x in let mark rf cluster = let c, w = rf in if cluster > max_cluster then ( Log.err (fun f -> f "Found a reference to cluster %s outside the file (max cluster \ %s) from cluster %s.%d" (Cluster.to_string cluster) (Cluster.to_string max_cluster) (Cluster.to_string c) w ) ; let src = Int64.add (Int64.of_int w) (Cluster.to_int64 c <| Int32.to_int t.h.Header.cluster_bits) in let dst = Cluster.to_int64 cluster <| Int32.to_int t.h.Header.cluster_bits in raise (Reference_outside_file (src, dst)) ) ; let c, w = rf in if cluster = Cluster.zero then () else ( if Cluster.Map.mem cluster !refs then ( let c', w' = Cluster.Map.find cluster !refs in Log.err (fun f -> f "Found two references to cluster %s: %s.%d and %s.%d" (Cluster.to_string cluster) (Cluster.to_string c) w (Cluster.to_string c') w' ) ; raise (Error.Duplicate_reference ( (Cluster.to_int64 c, w) , (Cluster.to_int64 c', w') , Cluster.to_int64 cluster ) ) ) ; Qcow_bitmap.( remove (Interval.make (Cluster.to_int64 cluster) (Cluster.to_int64 cluster)) free ) ; refs := Cluster.Map.add cluster rf !refs ) in (* scan the refcount table *) let open Lwt_error.Infix in let rec loop i = if i >= Int64.of_int32 t.h.Header.refcount_table_clusters then Lwt.return (Ok ()) else let refcount_cluster = Cluster.of_int64 @@ Int64.(add refcount_start_cluster i) in Metadata.read t.metadata refcount_cluster (fun c -> let addresses = Metadata.Physical.of_contents c in let rec loop i = if i >= Metadata.Physical.len addresses then Lwt.return (Ok ()) else let cluster = parse (Metadata.Physical.get addresses i) in mark (refcount_cluster, i) cluster ; loop (i + 1) in loop 0 ) >>= fun () -> loop (Int64.succ i) in loop 0L >>= fun () -> (* scan the L1 and L2 tables, marking the L2 and data clusters *) let rec l1_iter i = let l1_table_cluster = Cluster.of_int64 @@ Int64.(add l1_table_start_cluster i) in if i >= l1_table_clusters then Lwt.return (Ok ()) else Metadata.read t.metadata l1_table_cluster (fun c -> let l1 = Metadata.Physical.of_contents c in Lwt.return (Ok l1) ) >>= fun l1 -> let rec l2_iter i = if i >= Metadata.Physical.len l1 then Lwt.return (Ok ()) else let l2_table_cluster = parse (Metadata.Physical.get l1 i) in if l2_table_cluster <> Cluster.zero then ( mark (l1_table_cluster, i) l2_table_cluster ; Metadata.read t.metadata l2_table_cluster (fun c -> let l2 = Metadata.Physical.of_contents c in Lwt.return (Ok l2) ) >>= fun l2 -> let rec data_iter i = if i >= Metadata.Physical.len l2 then Lwt.return (Ok ()) else let cluster = parse (Metadata.Physical.get l2 i) in mark (l2_table_cluster, i) cluster ; data_iter (i + 1) in data_iter 0 >>= fun () -> l2_iter (i + 1) ) else l2_iter (i + 1) in l2_iter 0 >>= fun () -> l1_iter (Int64.succ i) in l1_iter 0L >>= fun () -> let map = make ~free ~refs:!refs ~first_movable_cluster ~cache:t.cache ~runtime_asserts:t.config.Config.runtime_asserts ~id ~cluster_size:(Int64.to_int cluster_size) in Lwt.return (Ok map) type check_result = {free: int64; used: int64} type compact_result = { copied: int64 ; refs_updated: int64 ; old_size: int64 ; new_size: int64 } let compact t ?(progress_cb = fun ~percent:_ -> ()) () = if t.config.Config.read_only then Lwt.return (Error `Is_read_only) else (* We will return a cancellable task to the caller, and on cancel we will set the cancel_requested flag. The main compact loop will detect this and complete the moves already in progress before returning. *) let cancel_requested = ref false in let th, u = Lwt.task () in Lwt.on_cancel th (fun () -> Log.info (fun f -> f "cancellation of compact requested") ; cancel_requested := true ) ; (* Catch stray exceptions and return as unknown errors *) let open Lwt.Infix in Lwt.async (fun () -> Lwt.catch (fun () -> let open Lwt_write_error.Infix in let open Qcow_cluster_map in let map = t.cluster_map in Log.debug (fun f -> f "Disk clusters: %s" (to_summary_string map)) ; let start_last_block = get_last_block map in let sector_size = Int64.of_int t.sector_size in let cluster_bits = Int32.to_int t.h.Header.cluster_bits in let sectors_per_cluster = Int64.div (1L <| cluster_bits) sector_size in let one_pass ?progress_cb () = Qcow_cluster_map.Debug.assert_no_leaked_blocks map ; let moves = Qcow_cluster_map.start_moves map in let open Lwt_write_error.Infix in Recycler.move_all ?progress_cb t.recycler moves >>= fun () -> (* Flush now so that if we crash after updating some of the references, the destination blocks will contain the correct data. *) let open Lwt.Infix in Recycler.flush t.recycler >>= function | Error e -> Lwt.return_error (adapt_write_error e) | Ok () -> ( let open Lwt_write_error.Infix in Recycler.update_references t.recycler >>= fun refs_updated -> (* Flush now so that the pointers are persisted before we truncate the file *) let open Lwt.Infix in Recycler.flush t.recycler >>= function | Error e -> Lwt.return_error (adapt_write_error e) | Ok () -> Lwt.return (Ok refs_updated) ) in one_pass ~progress_cb:(fun ~percent -> progress_cb ~percent:(percent * 80 / 100) ) () >>= fun refs_updated -> if refs_updated <> 0L then Log.info (fun f -> f "Pass 1: %Ld references updated" refs_updated ) ; (* modifying a L2 metadata block will have cancelled the move, so perform an additional pass. *) one_pass ~progress_cb:(fun ~percent -> progress_cb ~percent:(80 + (percent * 4 / 100)) ) () >>= fun refs_updated' -> if refs_updated' <> 0L then Log.info (fun f -> f "Pass 2: %Ld references updated" refs_updated' ) ; one_pass () >>= fun refs_updated'' -> if refs_updated'' <> 0L then Log.err (fun f -> f "Failed to reach a fixed point after %Ld, %Ld and %Ld \ block moves" refs_updated refs_updated' refs_updated'' ) ; let last_block = get_last_block map in let open Lwt_write_error.Infix in ( if last_block <> start_last_block then ( Log.info (fun f -> f "Shrink file so that last cluster was %s, now %s" (Cluster.to_string start_last_block) (Cluster.to_string last_block) ) ; let p = Physical.make ((Cluster.to_int last_block + 1) lsl t.cluster_bits) in let size_sectors = Physical.sector ~sector_size:t.sector_size p in resize_base t.base t.sector_size (Some (t.cluster_map, t.cluster_bits)) p >>= fun () -> Log.debug (fun f -> f "Resized file to %s clusters (%Ld sectors)" (Cluster.to_string last_block) size_sectors ) ; Lwt.return (Ok ()) ) else Lwt.return (Ok ()) ) >>= fun () -> progress_cb ~percent:100 ; let total_refs_updated = Int64.(add (add refs_updated refs_updated') refs_updated'') in let copied = Int64.(mul total_refs_updated sectors_per_cluster) in (* one ref per block *) let old_size = Int64.mul (Cluster.to_int64 start_last_block) sectors_per_cluster in let new_size = Int64.mul (Cluster.to_int64 last_block) sectors_per_cluster in let report = {refs_updated; copied; old_size; new_size} in if copied <> 0L || total_refs_updated <> 0L then Log.info (fun f -> f "%Ld sectors copied, %Ld references updated, file shrunk \ by %Ld sectors" copied total_refs_updated (Int64.sub old_size new_size) ) ; Lwt.return (Ok report) ) (fun e -> Lwt.return (Error (`Msg (Printexc.to_string e)))) >>= fun result -> Lwt.wakeup u result ; Lwt.return_unit ) ; th (* If a request from the client takes more than ~30s then the client may decide that the storage layer has failed. This could happen if a thread was starved or if there's deadlock, so try to detect it and log something useful. *) let with_deadline t describe_fn nsec f = let open Lwt.Infix in let timeout = Mirage_sleep.ns nsec >>= fun () -> Lwt.return (Error `Timeout) in let work = f () in Lwt.choose [timeout; (work >>= fun x -> Lwt.return (Ok x))] >>= function | Error `Timeout -> Log.err (fun f -> f "%s: I/O deadline exceeded" (describe_fn ())) ; Locks.Debug.dump_state t.locks ; work (* return the answer anyway *) | Ok x -> Lwt.cancel timeout ; Lwt.return x let time_30s = 30_000_000_000L let read t sector bufs = let describe_fn () = Printf.sprintf "read sector = %Ld length = %d" sector (Cstructs.len bufs) in with_deadline t describe_fn time_30s (fun () -> let open Lwt_error.Infix in Counter.inc (Metrics.reads t.config.Config.id) (float_of_int @@ List.fold_left ( + ) 0 @@ List.map Cstruct.length bufs ) ; let sectors_per_cluster = (1 lsl t.cluster_bits) / t.sector_size in let client = Locks.Client.make describe_fn in let cluster_size = 1L <| t.cluster_bits in let byte = Int64.(mul sector (of_int t.info.Mirage_block.sector_size)) in Error.Lwt_error.List.map_p (fun (byte, buf) -> let vaddr = Virtual.make ~cluster_bits:t.cluster_bits byte in ClusterIO.walk_readonly ~client t vaddr >>= function | None -> Cstruct.memset buf 0 ; Lwt.return (Ok None) (* no work to do *) | Some (offset', l1_lock, l2_lock) -> let sector = Physical.sector ~sector_size:t.sector_size offset' in Lwt.return (Ok (Some { sector ; bufs= [buf] ; metadata_locks= [l1_lock; l2_lock] } ) ) ) (chop_into_aligned cluster_size byte bufs) >>= fun work -> let work' = List.rev @@ List.fold_left (fun acc x -> match x with None -> acc | Some y -> y :: acc) [] work in (* work may contain contiguous items *) let work = coalesce_into_adjacent t.sector_size work' in let open Lwt.Infix in iter_p (fun work -> let first = Cluster.of_int64 Int64.(div work.sector (of_int sectors_per_cluster)) in let last_sector = Int64.( add work.sector (of_int (Cstructs.len work.bufs / t.sector_size)) ) in let last_sector' = Int64.(round_up last_sector (of_int sectors_per_cluster)) in let last = Cluster.of_int64 Int64.(div last_sector' (of_int sectors_per_cluster)) in Lwt.finalize (fun () -> Locks.Read.with_locks t.locks ~first ~last (fun () -> Lwt.catch (fun () -> B.read t.base work.sector work.bufs) (fun e -> Log.err (fun f -> f "%s: low-level I/O exception %s" (describe_fn ()) (Printexc.to_string e) ) ; Locks.Debug.dump_state t.locks ; let cluster = Cluster.of_int (Int64.to_int work.sector / sectors_per_cluster) in Qcow_debug.check_references t.metadata t.cluster_map ~cluster_bits:t.cluster_bits cluster >>= fun _ -> Cache.Debug.check_disk t.cache >>= fun _ -> Lwt.fail e ) ) >>= function | Error e -> Lwt.return_error (adapt_error e) | Ok () -> Lwt.return (Ok ()) ) (fun () -> List.iter Locks.unlock work.metadata_locks ; Lwt.return_unit ) ) work >>= fun result -> Locks.Debug.assert_no_locks_held client ; Lwt.return result ) let write t sector bufs = let describe_fn () = Printf.sprintf "write sector = %Ld length = %d" sector (Cstructs.len bufs) in if t.config.Config.read_only then Lwt.return (Error `Is_read_only) else with_deadline t describe_fn time_30s (fun () -> let open Lwt_write_error.Infix in Counter.inc (Metrics.writes t.config.Config.id) (float_of_int @@ List.fold_left ( + ) 0 @@ List.map Cstruct.length bufs ) ; let cluster_size = 1L <| t.cluster_bits in let client = Locks.Client.make describe_fn in let sectors_per_cluster = (1 lsl t.cluster_bits) / t.sector_size in let byte = Int64.(mul sector (of_int t.info.Mirage_block.sector_size)) in Error.Lwt_error.List.map_p (fun (byte, buf) -> let vaddr = Virtual.make ~cluster_bits:t.cluster_bits byte in ClusterIO.walk_readonly ~client t vaddr >>= function | None -> (* Only the first write to this area needs to allocate, so it's ok to make this a little slower *) Lwt.catch (fun () -> ClusterIO.walk_and_allocate ~client t vaddr >>= fun (offset', l1_lock, l2_lock) -> let sector = Physical.sector ~sector_size:t.sector_size offset' in Lwt.return (Ok { sector ; bufs= [buf] ; metadata_locks= [l1_lock; l2_lock] } ) ) (function | Error.Duplicate_reference ((c, w), (c', w'), target) as e -> Log.err (fun f -> f "Duplicate_reference during %s" (describe_fn ()) ) ; Qcow_debug.on_duplicate_reference t.metadata t.cluster_map ~cluster_bits:t.cluster_bits (c, w) (c', w') target >>= fun () -> Lwt.fail e | e -> Lwt.fail e ) | Some (offset', l1_lock, l2_lock) -> let sector = Physical.sector ~sector_size:t.sector_size offset' in Lwt.return (Ok {sector; bufs= [buf]; metadata_locks= [l1_lock; l2_lock]} ) ) (chop_into_aligned cluster_size byte bufs) >>= fun work' -> (let open Lwt.Infix in if !DebugSetting.compact_mid_write then ( Log.debug (fun f -> f "DebugSetting.compact_mid_write") ; compact t () >>= fun _ -> Lwt.return (Ok ()) ) else Lwt.return (Ok ()) ) >>= fun () -> (* work may contain contiguous items *) let work = coalesce_into_adjacent t.sector_size work' in let open Lwt.Infix in iter_p (fun work -> let first = Cluster.of_int64 Int64.(div work.sector (of_int sectors_per_cluster)) in let last_sector = Int64.( add work.sector (of_int (Cstructs.len work.bufs / t.sector_size)) ) in let last_sector' = Int64.(round_up last_sector (of_int sectors_per_cluster)) in let last = Cluster.of_int64 Int64.(div last_sector' (of_int sectors_per_cluster)) in Locks.Write.with_locks ~client t.locks ~first ~last (fun () -> (* Cancel any in-progress move since the data will be stale *) let rec loop n = if n > last then () else ( Qcow_cluster_map.cancel_move t.cluster_map n ; loop (Cluster.succ n) ) in loop first ; Lwt.finalize (fun () -> Lwt.catch (fun () -> B.write t.base work.sector work.bufs >>= function | Error e -> Lwt.return_error (adapt_write_error e) | Ok () -> Lwt.return (Ok ()) ) (fun e -> Log.err (fun f -> f "%s: low-level I/O exception %s" (describe_fn ()) (Printexc.to_string e) ) ; Locks.Debug.dump_state t.locks ; let cluster = Cluster.of_int (Int64.to_int work.sector / sectors_per_cluster) in Qcow_debug.check_references t.metadata t.cluster_map ~cluster_bits:t.cluster_bits cluster >>= fun _ -> Cache.Debug.check_disk t.cache >>= fun _ -> Lwt.fail e ) ) (fun () -> List.iter Locks.unlock work.metadata_locks ; Lwt.return_unit ) ) ) work >>= fun result -> Locks.Debug.assert_no_locks_held client ; Lwt.return result ) let seek_mapped t from = let open Lwt_error.Infix in let bytes = Int64.(mul from (of_int t.sector_size)) in let int64s_per_cluster = 1L <| Int32.to_int t.h.Header.cluster_bits - 3 in let rec scan_l1 a = if a.Virtual.l1_index >= Int64.of_int32 t.h.Header.l1_size then Lwt.return (Ok Int64.(mul t.info.Mirage_block.size_sectors (of_int t.sector_size)) ) else ClusterIO.find_mapped_l1_table t a.Virtual.l1_index >>= function | None -> Lwt.return (Ok Int64.( mul t.info.Mirage_block.size_sectors (of_int t.sector_size) ) ) | Some l1_index -> let a = {a with Virtual.l1_index} in ClusterIO.read_l1_table t a.Virtual.l1_index >>= fun (x, l1_lock) -> Locks.unlock l1_lock ; if Physical.to_bytes x = 0 then scan_l1 { a with Virtual.l1_index= Int64.succ a.Virtual.l1_index ; l2_index= 0L } else let rec scan_l2 a = if a.Virtual.l2_index >= int64s_per_cluster then scan_l1 { a with Virtual.l1_index= Int64.succ a.Virtual.l1_index ; l2_index= 0L } else ClusterIO.read_l2_table t x a.Virtual.l2_index >>= fun (x, l2_lock) -> Locks.unlock l2_lock ; if Physical.to_bytes x = 0 then scan_l2 {a with Virtual.l2_index= Int64.succ a.Virtual.l2_index} else Lwt.return (Ok (Qcow_virtual.to_offset ~cluster_bits:t.cluster_bits a) ) in scan_l2 a in scan_l1 (Virtual.make ~cluster_bits:t.cluster_bits bytes) >>= fun offset -> let x = Int64.(div offset (of_int t.sector_size)) in assert (x >= from) ; Lwt.return (Ok x) let seek_unmapped t from = let open Lwt_error.Infix in let bytes = Int64.(mul from (of_int t.sector_size)) in let int64s_per_cluster = 1L <| Int32.to_int t.h.Header.cluster_bits - 3 in let rec scan_l1 a = if a.Virtual.l1_index >= Int64.of_int32 t.h.Header.l1_size then Lwt.return (Ok Int64.(mul t.info.Mirage_block.size_sectors (of_int t.sector_size)) ) else ClusterIO.read_l1_table t a.Virtual.l1_index >>= fun (x, l1_lock) -> Locks.unlock l1_lock ; if Physical.to_bytes x = 0 then Lwt.return (Ok (Qcow_virtual.to_offset ~cluster_bits:t.cluster_bits a)) else let rec scan_l2 a = if a.Virtual.l2_index >= int64s_per_cluster then scan_l1 { a with Virtual.l1_index= Int64.succ a.Virtual.l1_index ; l2_index= 0L } else ClusterIO.read_l2_table t x a.Virtual.l2_index >>= fun (y, l2_lock) -> Locks.unlock l2_lock ; if Physical.to_bytes y = 0 then Lwt.return (Ok (Qcow_virtual.to_offset ~cluster_bits:t.cluster_bits a)) else scan_l2 {a with Virtual.l2_index= Int64.succ a.Virtual.l2_index} in scan_l2 a in scan_l1 (Virtual.make ~cluster_bits:t.cluster_bits bytes) >>= fun offset -> let x = Int64.(div offset (of_int t.sector_size)) in assert (x >= from) ; Lwt.return (Ok x) let disconnect t = B.disconnect t.base let make config base h = let open Lwt in B.get_info base >>= fun base_info -> (* The virtual disk has 512 byte sectors *) let info' = { Mirage_block.read_write= false ; sector_size= 512 ; size_sectors= Int64.(div h.Header.size 512L) } in (* We assume the backing device is resized dynamically so the size is the address of the next cluster *) let sector_size = base_info.Mirage_block.sector_size in let cluster_bits = Int32.to_int h.Header.cluster_bits in (* The first cluster is allocated after the L1 table *) let cluster_size = 1L <| cluster_bits in (* qemu-img will allocate a cluster by writing only a single sector to the end of the file. We insist that the file is a whole number of clusters in size. *) let sectors_per_cluster = Int64.(div (1L <| cluster_bits) (of_int sector_size)) in let new_size_sectors = Int64.round_up base_info.Mirage_block.size_sectors sectors_per_cluster in ( if new_size_sectors > base_info.Mirage_block.size_sectors then ( Log.info (fun f -> f "rounding up file to a whole number of clusters (= %Ld sectors)" new_size_sectors ) ; B.resize base new_size_sectors >>= function | Error _ -> Lwt.fail_with "resizing file" | Ok () -> Lwt.return_unit ) else Lwt.return_unit ) >>= fun () -> let locks = Locks.make () in let read_cluster i = let buf = malloc h in let cluster = Cluster.to_int64 i in let offset = cluster <| cluster_bits in let sector = Int64.(div offset (of_int sector_size)) in let open Lwt.Infix in Lwt.catch (fun () -> B.read base sector [buf] >>= function | Error _ -> Lwt.fail_with "unknown error" | Ok () -> Lwt.return (Ok buf) ) (fun e -> Log.err (fun f -> f "read_cluster %Ld: low-level I/O exception %s" cluster (Printexc.to_string e) ) ; Locks.Debug.dump_state locks ; Lwt.fail e ) in let write_cluster i buf = if config.Config.read_only then Lwt.return (Error `Is_read_only) else let cluster = Cluster.to_int64 i in let offset = cluster <| cluster_bits in let sector = Int64.(div offset (of_int sector_size)) in Lwt.catch (fun () -> B.write base sector [buf] >>= function | Error `Disconnected -> Lwt.return (Error `Disconnected) | Error `Is_read_only -> Lwt.return (Error `Is_read_only) | Error _ -> Lwt.fail_with "unknown error" | Ok () -> Lwt.return (Ok ()) ) (fun e -> Log.err (fun f -> f "write_cluster %Ld: low-level I/O exception %s" cluster (Printexc.to_string e) ) ; Locks.Debug.dump_state locks ; Lwt.fail e ) in let cache = Cache.create ~read_cluster ~write_cluster () in let metadata = Metadata.make ~cache ~cluster_bits ~locks () in let recycler = Recycler.create ~base ~sector_size ~cluster_bits ~cache ~locks ~metadata ~runtime_asserts:config.Config.runtime_asserts in let lazy_refcounts = match h.Header.additional with | Some {Header.lazy_refcounts= true; _} -> true | _ -> false in let stats = Stats.zero in let cluster_map = Qcow_cluster_map.zero in let cluster_map_m = Lwt_mutex.create () in let t' = { h ; base ; info= info' ; config ; locks ; recycler ; metadata ; cache ; sector_size ; cluster_bits ; lazy_refcounts ; stats ; cluster_map ; cluster_map_m } in Lwt_error.or_fail_with @@ make_cluster_map t' ~id:config.Config.id () >>= fun cluster_map -> if config.Config.runtime_asserts then Qcow_cluster_map.Debug.assert_equal cluster_map cluster_map ; (* An opened file may have junk at the end, which means that we would simultaneously allocate from it (get_last_block + n) as well as erase and recycle it. We should trim the file now so that it is safe to allocate from it as normal. Normally when the file is expanded the blocks at the end are not considered to be junk. *) let last_block = Qcow_cluster_map.get_last_block cluster_map in let size_clusters = Cluster.succ last_block in let p = Physical.make (Cluster.to_int size_clusters lsl cluster_bits) in let size_sectors = Physical.sector ~sector_size p in ( if config.Config.read_only then Lwt.return_unit else Lwt_write_error.or_fail_with @@ resize_base base sector_size None p >>= fun () -> Log.info (fun f -> f "Resized file to %s clusters (%Ld sectors)" (Cluster.to_string size_clusters) size_sectors ) ; Qcow_cluster_map.resize cluster_map size_clusters ; Lwt.return_unit ) >>= fun () -> t'.cluster_map <- cluster_map ; Metadata.set_cluster_map t'.metadata cluster_map ; Recycler.set_cluster_map t'.recycler cluster_map ; if config.Config.read_only then Lwt.return t' else ( ( match config.Config.keep_erased with | None -> () | Some sectors -> let keep_erased = Int64.(div (mul sectors (of_int sector_size)) cluster_size) in let compact_after_unmaps = match config.Config.compact_after_unmaps with | None -> None | Some sectors -> Some Int64.(div (mul sectors (of_int sector_size)) cluster_size) in Recycler.start_background_thread t'.recycler ~keep_erased ?compact_after_unmaps () ) ; ( if config.Config.discard && not lazy_refcounts then ( Log.info (fun f -> f "discard requested and lazy_refcounts is disabled: erasing \ refcount table and enabling lazy_refcounts" ) ; Lwt_error.or_fail_with @@ ClusterIO.Refcount.zero_all t' >>= fun () -> let additional = match h.Header.additional with | Some h -> {h with Header.lazy_refcounts= true} | None -> { Header.dirty= true ; corrupt= false ; lazy_refcounts= true ; autoclear_features= 0L ; refcount_order= 4l } in let extensions = [`Feature_name_table Header.Feature.understood] in let h = {h with Header.additional= Some additional; extensions} in Lwt_write_error.or_fail_with @@ update_header t' h >>= fun () -> t'.lazy_refcounts <- true ; Lwt.return_unit ) else Lwt.return_unit ) >>= fun () -> Recycler.flush t'.recycler >>= function | Error _ -> Log.err (fun f -> f "initial flush failed") ; Lwt.fail (Failure "initial flush failed") | Ok () -> Lwt.return t' ) let connect ?(config = Config.default ()) base = let open Lwt.Infix in B.get_info base >>= fun base_info -> let sector = Cstruct.sub Io_page.(to_cstruct (get 1)) 0 base_info.Mirage_block.sector_size in B.read base 0L [sector] >>= function | Error e -> Format.kasprintf Lwt.fail_with "%a" B.pp_error e | Ok () -> ( match Header.read sector with | Error (`Msg m) -> Lwt.fail_with m | Ok (h, _) -> make config base h >>= fun t -> let open Qcow_cluster_map in let free = total_free t.cluster_map in let used = total_used t.cluster_map in Log.info (fun f -> f "image has %Ld free sectors and %Ld used sectors" free used ) ; Lwt.return t ) let check base = let open Lwt.Infix in let open Qcow_cluster_map in Lwt.catch (fun () -> let config = Config.create ~read_only:true () in connect ~config base >>= fun t -> let free = total_free t.cluster_map in let used = total_used t.cluster_map in Lwt.return (Ok {free; used}) ) (function | Reference_outside_file (src, dst) -> Lwt.return (Error (`Reference_outside_file (src, dst))) | Error.Duplicate_reference ((c, w), (c', w'), dst) -> Lwt.return (Error (`Duplicate_reference ((c, w), (c', w'), dst))) | e -> Lwt.fail e ) let resize t ~new_size:requested_size_bytes ?(ignore_data_loss = false) () = if t.config.Config.read_only then Lwt.return (Error `Is_read_only) else let existing_size = t.h.Header.size in if existing_size > requested_size_bytes && not ignore_data_loss then Lwt.return (Error (`Msg (Printf.sprintf "Requested resize would result in data loss: requested size \ = %Ld but current size = %Ld" requested_size_bytes existing_size ) ) ) else let size = Int64.round_up requested_size_bytes 512L in let l2_tables_required = Header.l2_tables_required ~cluster_bits:t.cluster_bits size in (* Keep it simple for now by refusing resizes which would require us to reallocate the L1 table. *) let l2_entries_per_cluster = 1L <| Int32.to_int t.h.Header.cluster_bits - 3 in let old_max_entries = Int64.round_up (Int64.of_int32 t.h.Header.l1_size) l2_entries_per_cluster in let new_max_entries = Int64.round_up l2_tables_required l2_entries_per_cluster in if new_max_entries > old_max_entries then Lwt.return (Error (`Msg "I don't know how to resize in the case where the L1 table \ needs new clusters:" ) ) else update_header t {t.h with Header.l1_size= Int64.to_int32 l2_tables_required; size} let zero = let page = Io_page.(to_cstruct (get 1)) in Cstruct.memset page 0 ; page let rec erase t ~sector ~n () = let open Lwt_write_error.Infix in if n <= 0L then Lwt.return (Ok ()) else (* This could walk one cluster at a time instead of one sector at a time *) let byte = Int64.(mul sector (of_int t.info.Mirage_block.sector_size)) in let vaddr = Virtual.make ~cluster_bits:t.cluster_bits byte in (ClusterIO.walk_readonly t vaddr >>= function | None -> (* Already zero, nothing to do *) Lwt.return (Ok ()) | Some (offset', l1_lock, l2_lock) -> Lwt.finalize (fun () -> let base_sector, _ = Physical.to_sector ~sector_size:t.sector_size offset' in t.stats.Stats.nr_erased <- Int64.succ t.stats.Stats.nr_erased ; let open Lwt.Infix in B.write t.base base_sector [Cstruct.sub zero 0 t.info.Mirage_block.sector_size] >>= adapt_write_error_result ) (fun () -> Locks.unlock l1_lock ; Locks.unlock l2_lock ; Lwt.return_unit ) ) >>= fun () -> erase t ~sector:(Int64.succ sector) ~n:(Int64.pred n) () let discard t ~sector ~n () = let describe_fn () = Printf.sprintf "discard sector %Ld n %Ld" sector n in with_deadline t describe_fn time_30s (fun () -> let open Lwt_write_error.Infix in ( if not t.config.Config.discard then ( Log.err (fun f -> f "discard called but feature not implemented in configuration" ) ; Lwt.fail (Failure "Unimplemented") ) else Lwt.return (Ok ()) ) >>= fun () -> Counter.inc (Metrics.discards t.config.Config.id) Int64.(to_float @@ mul n @@ of_int t.sector_size) ; let client = Locks.Client.make describe_fn in (* we can only discard whole clusters. We will explicitly zero non-cluster aligned discards in order to satisfy RZAT *) (* round sector, n up to a cluster boundary *) let sectors_per_cluster = Int64.(div (1L <| t.cluster_bits) (of_int t.sector_size)) in let sector' = Int64.round_up sector sectors_per_cluster in (* we can only discard whole clusters. We will explicitly zero non-cluster aligned discards in order to satisfy RZAT *) let to_erase = min n (Int64.sub sector' sector) in erase t ~sector ~n:to_erase () >>= fun () -> let n' = Int64.sub n to_erase in let to_discard = Int64.round_down n' sectors_per_cluster in ClusterIO.walk_and_deallocate ~client t sector' to_discard >>= fun () -> erase t ~sector:(Int64.add sector' to_discard) ~n:(Int64.sub n' to_discard) () ) let create base ~size ?(lazy_refcounts = true) ?(cluster_bits = 16) ?(config = Config.default ()) () = let version = `Three in let backing_file_offset = 0L in let backing_file_size = 0l in let cluster_size = 1 lsl cluster_bits in let crypt_method = `None in (* qemu-img places the refcount table next in the file and only qemu-img creates a tiny refcount table and grows it on demand *) let refcount_table_offset = Physical.make cluster_size in let refcount_table_clusters = 1 in (* qemu-img places the L1 table after the refcount table *) let l1_table_offset = Physical.make ((refcount_table_clusters + 1) lsl cluster_bits) in let l2_tables_required = Header.l2_tables_required ~cluster_bits size in let nb_snapshots = 0l in let snapshots_offset = 0L in let additional = Some { Header.dirty= lazy_refcounts ; corrupt= false ; lazy_refcounts ; autoclear_features= 0L ; refcount_order= 4l } in let extensions = [`Feature_name_table Header.Feature.understood] in let h = { Header.version ; backing_file_offset ; backing_file_size ; cluster_bits= Int32.of_int cluster_bits ; size ; crypt_method ; l1_size= Int64.to_int32 l2_tables_required ; l1_table_offset ; refcount_table_offset ; refcount_table_clusters= Int32.of_int refcount_table_clusters ; nb_snapshots ; snapshots_offset ; additional ; extensions } in (* Resize the underlying device to contain the header + refcount table + l1 table. Future allocations will enlarge the file. *) let l1_size_bytes = 8 * Int64.to_int l2_tables_required in let next_free_byte = Int.round_up (Physical.to_bytes l1_table_offset + l1_size_bytes) cluster_size in let open Lwt in B.get_info base >>= fun base_info -> (* Erase existing contents *) let open Lwt_write_error.Infix in resize_base base base_info.Mirage_block.sector_size None (Physical.make 0) >>= fun () -> let p = Physical.make next_free_byte in resize_base base base_info.Mirage_block.sector_size None p >>= fun () -> let open Lwt.Infix in make config base h >>= fun t -> let open Lwt_write_error.Infix in update_header t h >>= fun () -> (* Write an initial empty refcount table *) let cluster = malloc t.h in Cstruct.memset cluster 0 ; let open Lwt.Infix in B.write base (Physical.sector ~sector_size:t.sector_size refcount_table_offset) [cluster] >>= function | Error e -> Lwt.return_error (adapt_write_error e) | Ok () -> ( let open Lwt_write_error.Infix in let next_cluster = next_free_byte / cluster_size in let rec loop limit i = if i = limit then Lwt.return (Ok ()) else ClusterIO.Refcount.incr t (Cluster.of_int i) >>= fun () -> loop limit (i + 1) in (* Increase the refcount of all header clusters i.e. those < next_free_cluster *) loop next_cluster 0 >>= fun () -> (* Write an initial empty L1 table *) let open Lwt.Infix in B.write base (Physical.sector ~sector_size:t.sector_size l1_table_offset) [cluster] >>= function | Error e -> Lwt.return_error (adapt_write_error e) | Ok () -> ( Recycler.flush t.recycler >>= function | Error e -> Lwt.return_error (adapt_write_error e) | Ok () -> Lwt.return (Ok t) ) ) let rebuild_refcount_table t = let open Lwt_write_error.Infix in let client = Locks.Client.make (fun () -> "rebuild_refcount_table") in (* Disable lazy refcounts so we actually update the real refcounts *) let lazy_refcounts = t.lazy_refcounts in t.lazy_refcounts <- false ; Log.info (fun f -> f "Zeroing existing refcount table") ; ClusterIO.Refcount.zero_all ~client t >>= fun () -> let cluster = Physical.cluster ~cluster_bits:t.cluster_bits t.h.Header.refcount_table_offset in let refcount_table_clusters = Int32.to_int t.h.Header.refcount_table_clusters in let rec loop i = if i >= refcount_table_clusters then Lwt.return (Ok ()) else ClusterIO.Refcount.incr ~client t Cluster.(add cluster (of_int i)) >>= fun () -> (* If any of the table entries point to a block, increase its refcount too *) Metadata.read ~client t.metadata Cluster.(add cluster (of_int i)) (fun c -> let addresses = Metadata.Physical.of_contents c in Lwt.return (Ok addresses) ) >>= fun addresses -> let rec inner i = if i >= Metadata.Physical.len addresses then Lwt.return (Ok ()) else let addr = Metadata.Physical.get addresses i in ( if addr <> Physical.unmapped then ( let cluster' = Physical.cluster ~cluster_bits:t.cluster_bits addr in Log.debug (fun f -> f "Refcount cluster %s has reference to cluster %s" (Cluster.to_string cluster) (Cluster.to_string cluster') ) ; (* It might have been incremented already by a previous `incr` *) ClusterIO.Refcount.read ~client t cluster' >>= function | 0 -> ClusterIO.Refcount.incr ~client t cluster' | _ -> Lwt.return (Ok ()) ) else Lwt.return (Ok ()) ) >>= fun () -> inner (i + 1) in inner 0 >>= fun () -> loop (i + 1) in Log.info (fun f -> f "Incrementing refcount of the refcount table clusters") ; loop 0 >>= fun () -> (* Increment the refcount of the header and L1 table *) Log.info (fun f -> f "Incrementing refcount of the header") ; ClusterIO.Refcount.incr ~client t Cluster.zero >>= fun () -> let l1_table_clusters = let refs_per_cluster = 1L <| t.cluster_bits - 3 in Int64.( to_int @@ div (round_up (of_int32 t.h.Header.l1_size) refs_per_cluster) refs_per_cluster ) in let l1_table_cluster = Physical.cluster ~cluster_bits:t.cluster_bits t.h.Header.l1_table_offset in let rec loop i = if i >= l1_table_clusters then Lwt.return (Ok ()) else ClusterIO.Refcount.incr ~client t Cluster.(add l1_table_cluster (of_int i)) >>= fun () -> (* Increment clusters of L1 tables *) Metadata.read ~client t.metadata Cluster.(add l1_table_cluster (of_int i)) (fun c -> let addresses = Metadata.Physical.of_contents c in Lwt.return (Ok addresses) ) >>= fun addresses -> let rec inner i = if i >= Metadata.Physical.len addresses then Lwt.return (Ok ()) else let addr = Metadata.Physical.get addresses i in ( if addr <> Physical.unmapped then ( let cluster' = Physical.cluster ~cluster_bits:t.cluster_bits addr in Log.debug (fun f -> f "L1 cluster %s has reference to L2 cluster %s" (Cluster.to_string cluster) (Cluster.to_string cluster') ) ; ClusterIO.Refcount.incr ~client t cluster' ) else Lwt.return (Ok ()) ) >>= fun () -> inner (i + 1) in inner 0 >>= fun () -> loop (i + 1) in Log.info (fun f -> f "Incrementing refcount of the %Ls L1 table clusters starting at %s" l1_table_clusters (Cluster.to_string l1_table_cluster) ) ; loop 0 >>= fun () -> (* Fold over the mapped data, incrementing refcounts along the way *) let sectors_per_cluster = Int64.(div (1L <| t.cluster_bits) (of_int t.sector_size)) in let rec loop sector = if sector >= t.info.Mirage_block.size_sectors then Lwt.return (Ok ()) else seek_mapped t sector >>= fun mapped_sector -> if mapped_sector <> sector then loop mapped_sector else ClusterIO.walk_readonly ~client t (Virtual.make ~cluster_bits:t.cluster_bits Int64.(mul (of_int t.info.Mirage_block.sector_size) mapped_sector) ) >>= function | None -> assert false | Some (offset', l1_lock, l2_lock) -> Locks.unlock l1_lock ; Locks.unlock l2_lock ; let cluster = Physical.cluster ~cluster_bits:t.cluster_bits offset' in ClusterIO.Refcount.incr ~client t cluster >>= fun () -> loop (Int64.add mapped_sector sectors_per_cluster) in Log.info (fun f -> f "Incrementing refcount of the data clusters") ; loop 0L >>= fun () -> (* Restore the original lazy_refcount setting *) t.lazy_refcounts <- lazy_refcounts ; Lwt.return (Ok ()) let flush t = let open Lwt.Infix in Recycler.flush t.recycler >>= adapt_write_error_result let header t = t.h module Debug = struct let check_no_overlaps t = let within = Physical.within_cluster ~cluster_bits:t.cluster_bits t.h.Header.l1_table_offset in assert (within = 0) ; let within = Physical.within_cluster ~cluster_bits:t.cluster_bits t.h.Header.refcount_table_offset in assert (within = 0) ; Lwt.return (Ok ()) let assert_no_leaked_blocks t = Qcow_cluster_map.Debug.assert_no_leaked_blocks t.cluster_map let assert_cluster_map_in_sync t = let open Lwt.Infix in Lwt_error.or_fail_with @@ make_cluster_map t () >>= fun cluster_map -> Qcow_cluster_map.Debug.assert_equal cluster_map t.cluster_map ; Lwt.return_unit module Setting = DebugSetting let metadata_blocks t = let clusters = Qcow_cluster_map.Debug.metadata_blocks t.cluster_map in Qcow_types.Cluster.( IntervalSet.( fold (fun i acc -> let x, y = Interval. ( to_int64 (x i) <| t.cluster_bits , (* the last inclusive byte = next cluster start - 1 *) Int64.pred (to_int64 (succ @@ y i) <| t.cluster_bits) ) in Qcow_types.Int64.IntervalSet.(add (Interval.make x y) acc) ) ) ) clusters Qcow_types.Int64.IntervalSet.empty end end