include module type of struct include Astring end
String
(** Redefinition of the [(^)] string operator. Strings are assumed to be
    immutable: no string is allocated whenever one of the arguments is an
    empty string, so the result is not always a fresh string. *)
val (^) : string -> string -> string
Characters (bytes in fact).
Strings, substrings, string sets and maps.
Differences with the OCaml String
module
First note that it is not a goal of Astring
to maintain compatibility with the OCaml String
module.
In Astring
:
- Strings are assumed to be immutable.
- Deprecated functions are not included.
- Some rarely used functions are dropped, some signatures and names are altered, a few often needed functions are added.
- Scanning functions are not doubled for supporting forward and reverse directions. Both directions are supported via a single function and an optional
rev
argument.
- Functions do not raise
Not_found
. They return option
values instead.
- Functions escaping bytes to printable US-ASCII characters use capital hexadecimal escapes rather than decimal ones.
- US-ASCII string support is collected in the
Char.Ascii
and String.Ascii
submodules. The functions make sure to operate only on the US-ASCII code points (rather than ISO/IEC 8859-1 code points). This means they can safely be used on UTF-8 encoded strings; they will of course only deal with the US-ASCII subset U+0000 to U+007F of Unicode scalar values.
- The module has pre-applied exception safe
String.Set
and String.Map
submodules.
Porting guide
Opening Astring
at the top of a module that uses the OCaml standard library in a project that compiles with -safe-string
will either result in typing errors or compatible behaviour except for uses of the String.trim
function, see below.
If for some reason you can't compile your project with -safe-string
this may not be a problem. However you have to make sure that your code does not depend on fresh strings being returned by functions of the String
module. The functions of Astring.String
assume strings to be immutable and thus do not always allocate fresh strings for their results. This is the case for example for the (^)
operator redefinition: no string is allocated whenever one of its arguments is an empty string. That being said it is still better to first make your project compile with -safe-string
and then port to Astring
.
The String.sub
function is renamed to String.with_range
. If you are working with String.find
you may find it easier to use String.with_index_range
which takes indices as arguments and is thus directly usable with the result of String.find
. But in general index based string processing should be frowned upon and replaced by substring extraction combinators.
Porting String.trim
usages
The standard OCaml String.trim
function only trims the characters ' '
, '\t'
, '\n'
, '\012'
, '\r'
. In Astring
the default set adds vertical tab (0x0B
) to the set to match the behaviour of the C isspace(3)
function.
If you want to preserve the behaviour of the original function you can replace any use of String.trim
with the following std_ocaml_trim
function:
(* [std_ocaml_trim s] trims [s] using only the characters dropped by the
   standard OCaml trim function: ' ', '\t', '\n', '\012' and '\r'.
   Vertical tab (0x0B) is deliberately not part of the set. *)
let std_ocaml_trim s =
  let is_std_white c = match c with
    | ' ' | '\t' | '\n' | '\012' | '\r' -> true
    | _ -> false
  in
  String.trim ~drop:is_std_white s
Examples
We show how to use substrings to quickly devise LL(1) parsers. To keep it simple we do not implement precise error reporting, but note that it would be easy to add it by replacing the raise Exit
calls by an exception with more information: we have everything at hand at these call points to report good error messages.
The first example parses version numbers structured as follows:
[v|V]major.minor[.patch][(+|-)info]
An unreadable Str
regular expression for this would be:
"[vV]?\\([0-9]+\\)\\.\\([0-9]+\\)\\(\\.\\([0-9]+\\)\\)?\\([+-]\\(.*\\)\\)?"
Using substrings is certainly less terse but note that the parser is made of reusable sub-functions.
(* [parse_version s] parses a version number structured as
   [v|V]major.minor[.patch][(+|-)info].

   Returns [Some (major, minor, patch, info)] on success, where [patch] is 0
   when the patch component is absent and [info] is [None] when there is no
   trailing info part. Returns [None] on any parse error. *)
let parse_version : string -> (int * int * int * string option) option =
  fun input ->
    (* Skips an optional leading 'v' or 'V'; empty input is an error. *)
    let skip_opt_v sub = match String.Sub.head sub with
      | None -> raise Exit
      | Some ('v' | 'V') -> String.Sub.tail sub
      | Some _ -> sub
    in
    (* Consumes a mandatory '.' separator. *)
    let expect_dot sub = match String.Sub.head sub with
      | Some '.' -> String.Sub.tail sub
      | Some _ | None -> raise Exit
    in
    (* Consumes a non-empty run of US-ASCII digits and converts it to an
       [int]; returns the integer and the remaining substring. *)
    let number sub =
      let digits, rest =
        String.Sub.span ~min:1 ~sat:Char.Ascii.is_digit sub
      in
      if String.Sub.is_empty digits then raise Exit else
      match String.Sub.to_int digits with
      | Some n -> n, rest
      | None -> raise Exit
    in
    try
      let major, rest = String.sub input |> skip_opt_v |> number in
      let minor, rest = rest |> expect_dot |> number in
      (* The patch component is optional and defaults to 0. *)
      let patch, rest = match String.Sub.head rest with
        | Some '.' -> number (expect_dot rest)
        | Some _ | None -> 0, rest
      in
      (* Whatever follows a '+' or '-' is the info string; any other
         trailing character is an error. *)
      let info = match String.Sub.head rest with
        | None -> None
        | Some ('+' | '-') ->
            Some (String.Sub.to_string (String.Sub.tail rest))
        | Some _ -> raise Exit
      in
      Some (major, minor, patch, info)
    with Exit -> None
The second example parses environments made of space-separated key-value bindings of the form:
key0 = value0 key1 = value1 ...
To support values with spaces, values can be quoted between two '"'
characters. If they are quoted then any "\\\""
subsequence (0x5C
,0x22
) is interpreted as the character '"'
(0x22
) and "\\\\"
(0x5C
,0x5C
) is interpreted as the character '\\'
(0x5C
).
(* [parse_env s] parses a sequence of white space separated [key = value]
   bindings into a string map.

   Keys are non-empty runs of US-ASCII letters and '_'. A value is either an
   unquoted run of non-white characters (possibly empty at end of input) or
   a double-quoted string in which a backslash may only be followed by a
   double quote or another backslash, standing for that character.

   Returns [Some map] on success and [None] on any parse error. *)
let parse_env : string -> string String.map option =
  fun input ->
    let skip_white sub = String.Sub.drop ~sat:Char.Ascii.is_white sub in
    (* Consumes a non-empty key and returns it with the remaining input. *)
    let parse_key sub =
      let is_key_char c = c = '_' || Char.Ascii.is_letter c in
      let key, rest = String.Sub.span ~min:1 ~sat:is_key_char sub in
      if String.Sub.is_empty key then raise Exit
      else String.Sub.to_string key, rest
    in
    (* Consumes the mandatory '=' between a key and its value. *)
    let parse_eq sub = match String.Sub.head sub with
      | Some '=' -> String.Sub.tail sub
      | Some _ | None -> raise Exit
    in
    (* Parses the body of a quoted value; [sub] starts right after the
       opening quote. Pieces are accumulated in reverse order and
       concatenated once the closing quote is found. *)
    let parse_quoted sub =
      let is_plain = function '\\' | '"' -> false | _ -> true in
      let rec collect chunks sub =
        let plain, rest = String.Sub.span ~sat:is_plain sub in
        match String.Sub.head rest with
        | Some '"' ->
            let value = String.Sub.concat (List.rev (plain :: chunks)) in
            String.Sub.to_string value, String.Sub.tail rest
        | Some '\\' ->
            let after = String.Sub.tail rest in
            (match String.Sub.head after with
             | Some ('"' | '\\' as c) ->
                 let escaped = String.sub (String.of_char c) in
                 collect (escaped :: plain :: chunks) (String.Sub.tail after)
             | Some _ | None -> raise Exit)
        | Some _ | None -> raise Exit (* unterminated quoted value *)
      in
      collect [] sub
    in
    let parse_value sub = match String.Sub.head sub with
      | None -> "", sub
      | Some '"' -> parse_quoted (String.Sub.tail sub)
      | Some _ ->
          let is_data c = not (Char.Ascii.is_white c) in
          let data, rest = String.Sub.span ~sat:is_data sub in
          String.Sub.to_string data, rest
    in
    let rec parse_bindings env sub =
      if String.Sub.is_empty sub then env else
      let key, sub = parse_key sub in
      let value, sub =
        parse_value (skip_white (parse_eq (skip_white sub)))
      in
      parse_bindings (String.Map.add key value env) (skip_white sub)
    in
    try Some (parse_bindings String.Map.empty (skip_white (String.sub input)))
    with Exit -> None