diff --git a/common/mlpcre/PCRE.ml b/common/mlpcre/PCRE.ml
index 753e247e4..b054928f9 100644
--- a/common/mlpcre/PCRE.ml
+++ b/common/mlpcre/PCRE.ml
@@ -52,5 +52,64 @@ let rec replace ?(global = false) patt subst subj =
     xs ^ subst ^ zs
   )
 
+(* Cut subj around the first match of patt, returning the text before
+ * and after the match, or (subj, "") when patt does not match at all.
+ *)
+let split patt subj =
+  if not (matches patt subj) then
+    subj, ""
+  else (
+    (* If patt matches "yyyy" in the original string then we have
+     * the following situation, where "xxxx" is the part of the
+     * original string before the match, and "zzzz..." is the
+     * part after the match:
+     * "xxxxyyyyzzzzzzzzzzzzz"
+     *      ^   ^
+     *     i1   i2
+     *)
+    let i1, i2 = subi 0 in
+    let xs = String.sub subj 0 i1 (* "xxxx", part before the match *) in
+    let zs = String.sub subj i2 (String.length subj - i2) (* after *) in
+    xs, zs
+  )
+
+(* Split subj on every match of patt, returning at most max elements
+ * (max = 0 means no limit).
+ *)
+let nsplit ?(max = 0) patt subj =
+  if max < 0 then
+    invalid_arg "PCRE.nsplit: max parameter should not be negative";
+
+  (* acc holds the pieces found so far in reverse order; limit is
+   * the number of list elements we may still return (0 = unlimited).
+   * Accumulating keeps the loop tail-recursive, so a subject with a
+   * very large number of separators cannot overflow the stack.
+   *)
+  let rec loop acc limit subj =
+    (* If we reached the limit, OR if the pattern does not match the
+     * string at all, the rest of the string is the final element.
+     *)
+    if limit = 1 || not (matches patt subj) then
+      List.rev (subj :: acc)
+    else (
+      (* Reuse the offsets of the match just made instead of calling
+       * split, which would run the regular expression a second time.
+       *)
+      let i1, i2 = subi 0 in
+      if i2 = 0 then
+        (* Empty match at the very start: the remainder would be the
+         * whole of subj again, so stop instead of recursing forever.
+         *)
+        List.rev (subj :: acc)
+      else (
+        let xs = String.sub subj 0 i1 in
+        let zs = String.sub subj i2 (String.length subj - i2) in
+        let limit = if limit = 0 then 0 else limit - 1 in
+        loop (xs :: acc) limit zs
+      )
+    )
+  in
+  loop [] max subj
+
 let () =
   Callback.register_exception "PCRE.Error" (Error ("", 0))
diff --git a/common/mlpcre/PCRE.mli b/common/mlpcre/PCRE.mli
index fcf6fd25e..eacb6fd90 100644
--- a/common/mlpcre/PCRE.mli
+++ b/common/mlpcre/PCRE.mli
@@ -110,3 +110,30 @@ val replace : ?global:bool -> regexp -> string -> string -> string
 
     Note that this function does not allow backreferences.
     Any captures in [patt] are ignored. *)
+
+val split : regexp -> string -> string * string
+(** [split patt subj] splits the string at the first occurrence
+    of the regular expression [patt], returning the parts of the
+    string before and after the match (the matching part is not
+    returned).  If the pattern does not match then the whole
+    input is returned in the first string, and the second string
+    is empty. *)
+
+val nsplit : ?max:int -> regexp -> string -> string list
+(** [nsplit patt subj] is the same as [split] but the string is
+    split on every occurrence of [patt].  Note that if the pattern
+    matches at the beginning or end of the string, then an
+    empty string element will be returned at the beginning or
+    end of the list.
+
+    [nsplit] has an optional [?max] parameter which controls
+    the maximum length of the returned list.  The final element
+    contains the remainder of the string.  The default, [0],
+    means that the length of the list is not limited.
+
+    If [patt] matches the empty string at the start of the
+    remaining input then splitting stops there and the remainder
+    is returned as the final element, since no further progress
+    could be made.
+
+    Raises [Invalid_argument] if [max] is negative. *)
diff --git a/common/mlpcre/pcre_tests.ml b/common/mlpcre/pcre_tests.ml
index 9d42914b9..346019c40 100644
--- a/common/mlpcre/pcre_tests.ml
+++ b/common/mlpcre/pcre_tests.ml
@@ -42,6 +42,20 @@ let replace ?(global = false) patt subst subj =
   eprintf " %s\n%!" r;
   r
 
+let split patt subj =
+  eprintf "PCRE.split %s ->%!" subj;
+  let s1, s2 = PCRE.split patt subj in
+  eprintf " (%s, %s)\n%!" s1 s2;
+  (s1, s2)
+
+let nsplit ?(max = 0) patt subj =
+  eprintf "PCRE.nsplit%s %s ->%!"
+    (if max = 0 then "" else sprintf " ~max:%d" max)
+    subj;
+  let ss = PCRE.nsplit ~max patt subj in
+  eprintf " [%s]\n%!" (String.concat "; " ss);
+  ss
+
 let sub i =
   eprintf "PCRE.sub %d ->%!" i;
   let r = PCRE.sub i in
@@ -60,6 +74,8 @@ let () =
     let re1 = compile "(a+)b" in
     let re2 = compile "(a+)(b*)" in
     let re3 = compile ~caseless:true "[^a-z0-9_]" in
+    let ws = compile "\\s+" in
+    let empty = compile "x*" in
 
     assert (matches re0 "ccaaabbbb" = true);
     assert (sub 0 = "aaab");
@@ -101,6 +117,31 @@ let () =
     assert (replace ~global:true re3 "-" "this is a\xc2\xa3FUNNY.name?"
             (* = "this-is-a-FUNNY-name-" if UTF-8 worked *)
             = "this-is-a--FUNNY-name-");
+
+    (* Test PCRE.split directly. *)
+    assert (split ws "a b c" = ("a", "b c"));
+    assert (split ws "abc" = ("abc", ""));
+    assert (split ws "abc " = ("abc", ""));
+
+    (* Test PCRE.nsplit. *)
+    assert (nsplit ~max:1 ws "a b c" = [ "a b c" ]);
+    assert (nsplit ~max:2 ws "a b c" = [ "a"; "b c" ]);
+    assert (nsplit ~max:3 ws "a b c" = [ "a"; "b"; "c" ]);
+    assert (nsplit ~max:10 ws "a b c" = [ "a"; "b"; "c" ]);
+    assert (nsplit ws "the cat sat on \t\t the mat." =
+              [ "the"; "cat"; "sat"; "on"; "the"; "mat." ]);
+    assert (nsplit ~max:5 ws "the cat sat on \t\t the mat." =
+              [ "the"; "cat"; "sat"; "on"; "the mat." ]);
+    assert (nsplit ws " the " = [ ""; "the"; "" ]);
+    assert (nsplit ws "the " = [ "the"; "" ]);
+    assert (nsplit ws " the" = [ ""; "the" ]);
+    assert (nsplit ws " \t the" = [ ""; "the" ]);
+
+    (* A pattern which can match the empty string must not make
+     * nsplit recurse forever.
+     *)
+    assert (nsplit empty "abc" = [ "abc" ]);
+    assert (nsplit ~max:3 empty "abc" = [ "abc" ]);
   with
   | Not_found ->
      failwith "one of the PCRE.sub functions unexpectedly raised Not_found"