Problem
This is a parsing function that will at tildes (~) to end of search terms in certain circumstances.
Example an inputs and outputs:
Input: Output: name:(john doe) name:(john~ doe~) name:[andy TO charlie] name:[andy TO charlie] john doe john~ doe~ james NOT jane james~ NOT jane james NOT (james smith) james~ NOT (james smith) james NOT jane smith james~ NOT jane smith~ name:"john doe" australia name:"john doe" australia~
function addTilde(string) {
if (!/[[[]~"(NOT)-!d()(OR)(AND)&|: ]/.test(string)) {
string = string.concat("~");
}
return string;
};
function fuzzQuery(rawQuery) {
/*split the string into spaces, brackets, double quotes and words*/
re = /(?=[()[] "])|(?=[^W])b/;
strSplit = rawQuery.split(re);
newQuery = "";
for (var i = 0; i < strSplit.length; i++) {
var s = strSplit[i];
var newElement = "";
/*if it contains a [ or "*/
if (s.indexOf("x22") != -1 || s.indexOf("[") != -1) {
/*determine closing symbol*/
var closingSymbol;
if (s == "x22") {
closingSymbol = "x22";
newElement = newElement.concat(strSplit[i++]); /*need to skip opening one for double quotes*/
} else closingSymbol = "]";
/*concat elements together until closing element found)*/
do {
newElement = newElement.concat(strSplit[i]);
}
while (strSplit[i++] != closingSymbol)
}
/*if it contains a NOT*/
else if (s.indexOf("NOT") != -1) {
newElement = strSplit[i++]; /*concat the NOT*/
/*concat any spaces*/
while (strSplit[i] == " ") {
newElement = newElement.concat(strSplit[i++]);
}
if (strSplit[i] == "(") {
do {
newElement = newElement.concat(strSplit[i]);
}
while (strSplit[i++] != ")")
} else newElement = newElement.concat(strSplit[i++]);
} else(newElement = strSplit[i]);
newElement = addTilde(newElement);
newQuery = newQuery.concat(newElement);
}
return newQuery;
};
Now fuzzQuery
is quite a long method.
It essentially has five parts.
- Split the initial query out into elements.
-
Loop through each element.
a) Concat square brackets and double quotes.
else b) concat NOTs.
now add tilde to element if appropriate
-
Return the join the elements back together and return the new query.
What I was thinking is that you could pass off steps two and three to their own methods, so that the whole query looks something like (but not exactly like!):
function fuzzQuery(rawQuery)
{
strSplit = splitQuery(rawQuery);
concatSqrAndDblQuotes(strSplit);
concatNots(strSplit);
return putBackTogether(strSplit);
}
ie.
function doSquareAndDblQuotes(strSplit, i) {
if (s.indexOf("x22") != -1 || s.indexOf("[") != -1) {
/*determine closing symbol*/
var closingSymbol;
if (s == "x22") {
closingSymbol = "x22";
newElement = newElement.concat(strSplit[i++]); /*need to skip opening one for double quotes*/
} else closingSymbol = "]";
/*concat elements together until closing element found)*/
do {
newElement = newElement.concat(strSplit[i]);
}
while (strSplit[i++] != closingSymbol)
}
return newElement;
}
But the problem is here that we’d need to be keeping track of a few variables being changed in this function. ie. the i counter, and whether or not that if statement was executed. So you could start using globals (is that even a thing in javascript?)… and it gets messy.
So possibly another way, would be to create an object that you pass in, and return, which keeps track of these variables.
What do you think?
Solution
You aren’t using regular expressions to your advantage. Capture, don’t split. Capturing helps you analyze the tokens you are interested in. Splitting just gets you the location of the delimiters.
function fuzzQuery(rawQuery) {
"use strict";
// ( 1 ) ( 2 ) ( 3 ) ( 4 ) ( 5 ) ( 6 )
var re = /s*(?:(NOT)s+)?([a-z]+:)?(?:("[^"]*")|(([^)]*))|([[^]]*])|([a-z]+))s*/g;
var matches;
var lastIndex = -1;
while (matches = re.exec(rawQuery)) {
var relOp = matches[1],
qualifier = matches[2],
quotedStr = matches[3],
parensStr = matches[4],
bracketStr = matches[5],
bareWord = matches[6];
lastIndex = re.lastIndex;
console.log("relOp=" + relOp +
", qualifier=" + qualifier +
", quotedStr=" + quotedStr +
", parensStr=" + parensStr +
", bracketStr=" + bracketStr +
", bareWord=" + bareWord);
}
if (lastIndex != rawQuery.length) {
console.log("Junk=" + rawQuery.substring(lastIndex));
}
}
Examples:
-
name:(john doe)
relOp=undefined, qualifier=name:, quotedStr=undefined, parensStr=(john doe), bracketStr=undefined, bareWord=undefined
-
name:[andy TO charlie]
relOp=undefined, qualifier=name:, quotedStr=undefined, parensStr=undefined, bracketStr=[andy TO charlie], bareWord=undefined
-
john doe
relOp=undefined, qualifier=undefined, quotedStr=undefined, parensStr=undefined, bracketStr=undefined, bareWord=john relOp=undefined, qualifier=undefined, quotedStr=undefined, parensStr=undefined, bracketStr=undefined, bareWord=doe
-
james NOT jane
relOp=undefined, qualifier=undefined, quotedStr=undefined, parensStr=undefined, bracketStr=undefined, bareWord=james relOp=NOT, qualifier=undefined, quotedStr=undefined, parensStr=undefined, bracketStr=undefined, bareWord=jane
-
james NOT (james smith)
relOp=undefined, qualifier=undefined, quotedStr=undefined, parensStr=undefined, bracketStr=undefined, bareWord=james relOp=NOT, qualifier=undefined, quotedStr=undefined, parensStr=(james smith), bracketStr=undefined, bareWord=undefined
-
james NOT jane smith
relOp=undefined, qualifier=undefined, quotedStr=undefined, parensStr=undefined, bracketStr=undefined, bareWord=james relOp=NOT, qualifier=undefined, quotedStr=undefined, parensStr=undefined, bracketStr=undefined, bareWord=jane relOp=undefined, qualifier=undefined, quotedStr=undefined, parensStr=undefined, bracketStr=undefined, bareWord=smith
-
name:"john doe" australia
relOp=undefined, qualifier=undefined, quotedStr="john doe", parensStr=undefined, bracketStr=undefined, bareWord=undefined relOp=undefined, qualifier=undefined, quotedStr=undefined, parensStr=undefined, bracketStr=undefined, bareWord=australia