common/mlpcre: Add split and nsplit functions.

These work like our String.split and String.nsplit functions.
This commit is contained in:
Richard W.M. Jones
2017-09-21 19:12:06 +01:00
parent 8bd5933cc7
commit 45ef3545d4
3 changed files with 81 additions and 0 deletions

View File

@@ -52,5 +52,38 @@ let rec replace ?(global = false) patt subst subj =
xs ^ subst ^ zs
)
(* [split patt subj] cuts [subj] around the first match of [patt].
 *
 * Returns [(before, after)] where [before] is the text preceding the
 * match and [after] is the text following it; the matched text itself
 * is dropped.  If [patt] does not match, returns [(subj, "")].
 *)
let split patt subj =
  if not (matches patt subj) then
    subj, ""
  else (
    (* If patt matches "yyyy" in the original string then we have
     * the following situation, where "xxxx" is the part of the
     * original string before the match, and "zzzz..." is the
     * part after the match:
     * "xxxxyyyyzzzzzzzzzzzzz"
     *      ^   ^
     *      i1  i2
     *)
    let i1, i2 = subi 0 in
    let xs = String.sub subj 0 i1 (* "xxxx", part before the match *) in
    let zs = String.sub subj i2 (String.length subj - i2) (* after *) in
    xs, zs
  )

(* [nsplit ?max patt subj] splits [subj] on every match of [patt].
 *
 * [max] limits the length of the returned list; the last element then
 * holds the unsplit remainder.  [max = 0] (the default) means no limit.
 *
 * Raises [Invalid_argument] if [max] is negative.
 *
 * The work is done by a tail-recursive loop which accumulates the
 * pieces in reverse, so the stack does not grow with the number of
 * pieces, and each piece costs a single regexp match.
 *)
let nsplit ?(max = 0) patt subj =
  if max < 0 then
    invalid_arg "PCRE.nsplit: max parameter should not be negative";

  (* [acc] holds the pieces found so far, most recent first. *)
  let rec loop acc max subj =
    (* If we reached the limit, OR if the pattern does not match the
     * string at all, the rest of the string is the final element.
     *)
    if max = 1 || not (matches patt subj) then
      List.rev (subj :: acc)
    else (
      (* Offsets of the start and end of the match, as in [split]. *)
      let i1, i2 = subi 0 in
      if i2 = 0 then
        (* A zero-length match at the very start of the string consumes
         * nothing, so splitting here would hand the same string back
         * to the next iteration and never make progress.  Stop and
         * return the remainder instead.
         *)
        List.rev (subj :: acc)
      else (
        let s1 = String.sub subj 0 i1 in
        let s2 = String.sub subj i2 (String.length subj - i2) in
        let max = if max = 0 then 0 else max - 1 in
        loop (s1 :: acc) max s2
      )
    )
  in
  loop [] max subj
(* Register the [Error] exception under the name "PCRE.Error" so that it
 * can be looked up by that name (presumably by the C side of these
 * bindings - confirm against the C source).  The ("", 0) payload looks
 * like a placeholder; only an exception value is needed to register it.
 *)
let () =
  let exn = Error ("", 0) in
  Callback.register_exception "PCRE.Error" exn

View File

@@ -110,3 +110,22 @@ val replace : ?global:bool -> regexp -> string -> string -> string
Note that this function does not allow backreferences.
Any captures in [patt] are ignored. *)
val split : regexp -> string -> string * string
val nsplit : ?max:int -> regexp -> string -> string list
(** [split patt subj] splits the string at the first occurrence
    of the regular expression [patt], returning the parts of the
    string before and after the match (the matching part is not
    returned).  If the pattern does not match then the whole
    input is returned in the first string, and the second string
    is empty.

    [nsplit patt subj] is the same but the string is split
    on every occurrence of [patt].  Note that if the pattern
    matches at the beginning or end of the string, then an
    empty string element will be returned at the beginning or
    end of the list.  If the pattern does not match at all,
    the result is a list containing only [subj].

    [nsplit] has an optional [?max] parameter which controls
    the maximum length of the returned list.  The final element
    contains the remainder of the string.  The default, [0],
    means that the length of the list is not limited.

    @raise Invalid_argument if [max] is negative. *)

View File

@@ -42,6 +42,20 @@ let replace ?(global = false) patt subst subj =
eprintf " %s\n%!" r;
r
(* Tracing wrapper around [PCRE.split]: prints the subject string and
 * the resulting pair on stderr, then returns the pair unchanged.
 *)
let split patt subj =
  eprintf "PCRE.split <patt> %s ->%!" subj;
  let (before, after) as parts = PCRE.split patt subj in
  eprintf " (%s, %s)\n%!" before after;
  parts
(* Tracing wrapper around [PCRE.nsplit]: prints the call (showing
 * [~max] only when it is non-zero) and the resulting list on stderr,
 * then returns the list unchanged.
 *)
let nsplit ?(max = 0) patt subj =
  let max_label = if max = 0 then "" else sprintf " ~max:%d" max in
  eprintf "PCRE.nsplit%s <patt> %s ->%!" max_label subj;
  let parts = PCRE.nsplit ~max patt subj in
  let shown = String.concat "; " parts in
  eprintf " [%s]\n%!" shown;
  parts
let sub i =
eprintf "PCRE.sub %d ->%!" i;
let r = PCRE.sub i in
@@ -60,6 +74,7 @@ let () =
let re1 = compile "(a+)b" in
let re2 = compile "(a+)(b*)" in
let re3 = compile ~caseless:true "[^a-z0-9_]" in
let ws = compile "\\s+" in
assert (matches re0 "ccaaabbbb" = true);
assert (sub 0 = "aaab");
@@ -101,6 +116,20 @@ let () =
assert (replace ~global:true re3 "-" "this is a\xc2\xa3FUNNY.name?"
(* = "this-is-a-FUNNY-name-" if UTF-8 worked *)
= "this-is-a--FUNNY-name-");
(* This also tests PCRE.split since that is used by nsplit. *)
assert (nsplit ~max:1 ws "a b c" = [ "a b c" ]);
assert (nsplit ~max:2 ws "a b c" = [ "a"; "b c" ]);
assert (nsplit ~max:3 ws "a b c" = [ "a"; "b"; "c" ]);
assert (nsplit ~max:10 ws "a b c" = [ "a"; "b"; "c" ]);
assert (nsplit ws "the cat sat on \t\t the mat." =
[ "the"; "cat"; "sat"; "on"; "the"; "mat." ]);
assert (nsplit ~max:5 ws "the cat sat on \t\t the mat." =
[ "the"; "cat"; "sat"; "on"; "the mat." ]);
assert (nsplit ws " the " = [ ""; "the"; "" ]);
assert (nsplit ws "the " = [ "the"; "" ]);
assert (nsplit ws " the" = [ ""; "the" ]);
assert (nsplit ws " \t the" = [ ""; "the" ]);
with
| Not_found ->
failwith "one of the PCRE.sub functions unexpectedly raised Not_found"