Matching Nested Constructs in JavaScript

In the past, I've touched on using regexes to match nested constructs up to a predetermined depth, which is the best you can do unless you're using one of the three regex engines (Perl, PCRE, and .NET) which are currently able to handle true recursion.

Well, recently I wanted to be able to support unlimited nesting depth in a fast, flexible, and easy to use way when matching strings in JavaScript, so here's the code I wrote for it. Basic documentation and examples are included in the code comments.

// (c) 2007 Steven Levithan <stevenlevithan.com>
// MIT License

/*** matchRecursive
	accepts a string to search and a format (start and end tokens separated by "...").
	returns an array of matches, allowing nested instances of format.

	examples:
		matchRecursive("test",          "(...)")   -> []
		matchRecursive("(t(e)s)()t",    "(...)")   -> ["t(e)s", ""]
		matchRecursive("t<e>>st",       "<...>")   -> ["e"]
		matchRecursive("t<<e>st",       "<...>")   -> ["e"]
		matchRecursive("t<<e>>st",      "<...>")   -> ["<e>"]
		matchRecursive("<|t<e<|s|>t|>", "<|...|>") -> ["t<e<|s|>t"]
*/
var matchRecursive = function () {
	var	formatParts = /^([\S\s]+?)\.\.\.([\S\s]+)/,
		metaChar = /[-[\]{}()*+?.\\^$|,]/g,
		escape = function (str) {
			return str.replace(metaChar, "\\$&");
		};

	return function (str, format) {
		var p = formatParts.exec(format);
		if (!p) throw new Error("format must include start and end tokens separated by '...'");
		if (p[1] == p[2]) throw new Error("start and end format tokens cannot be identical");

		var	opener = p[1],
			closer = p[2],
			/* Use an optimized regex when opener and closer are one character each */
			iterator = new RegExp(format.length == 5 ? "["+escape(opener+closer)+"]" : escape(opener)+"|"+escape(closer), "g"),
			results = [],
			openTokens, matchStartIndex, match;

		do {
			openTokens = 0;
			while (match = iterator.exec(str)) {
				if (match[0] == opener) {
					if (!openTokens)
						matchStartIndex = iterator.lastIndex;
					openTokens++;
				} else if (openTokens) {
					openTokens--;
					if (!openTokens)
						results.push(str.slice(matchStartIndex, match.index));
				}
			}
		} while (openTokens && (iterator.lastIndex = matchStartIndex));

		return results;
	};
}();

You can download the code here.

Note that the format argument expects a simple string; not a regular expression. However, the code could easily be modified to work with regexes if that's what you were after.

Update: I've posted an alternative version which accepts regex patterns as the format as matchRecursiveRegExp.

2 thoughts on “Matching Nested Constructs in JavaScript”

Leave a Reply

Your email address will not be published. Required fields are marked *