diff --git a/src/applications/search/compiler/PhutilSearchQueryCompiler.php b/src/applications/search/compiler/PhutilSearchQueryCompiler.php index 0426180197..e20e2bb9e4 100644 --- a/src/applications/search/compiler/PhutilSearchQueryCompiler.php +++ b/src/applications/search/compiler/PhutilSearchQueryCompiler.php @@ -1,403 +1,421 @@ <?php final class PhutilSearchQueryCompiler extends Phobject { private $operators = '+ -><()~*:""&|'; private $query; private $stemmer; private $enableFunctions = false; const OPERATOR_NOT = 'not'; const OPERATOR_AND = 'and'; const OPERATOR_SUBSTRING = 'sub'; const OPERATOR_EXACT = 'exact'; + const OPERATOR_ABSENT = 'absent'; + const OPERATOR_PRESENT = 'present'; public function setOperators($operators) { $this->operators = $operators; return $this; } public function getOperators() { return $this->operators; } public function setStemmer(PhutilSearchStemmer $stemmer) { $this->stemmer = $stemmer; return $this; } public function getStemmer() { return $this->stemmer; } public function setEnableFunctions($enable_functions) { $this->enableFunctions = $enable_functions; return $this; } public function getEnableFunctions() { return $this->enableFunctions; } public function compileQuery(array $tokens) { assert_instances_of($tokens, 'PhutilSearchQueryToken'); $result = array(); foreach ($tokens as $token) { $result[] = $this->renderToken($token); } return $this->compileRenderedTokens($result); } public function compileLiteralQuery(array $tokens) { assert_instances_of($tokens, 'PhutilSearchQueryToken'); $result = array(); foreach ($tokens as $token) { if (!$token->isQuoted()) { continue; } $result[] = $this->renderToken($token); } return $this->compileRenderedTokens($result); } public function compileStemmedQuery(array $tokens) { assert_instances_of($tokens, 'PhutilSearchQueryToken'); $result = array(); foreach ($tokens as $token) { if ($token->isQuoted()) { continue; } $result[] = $this->renderToken($token, $this->getStemmer()); } return $this->compileRenderedTokens($result); } private function compileRenderedTokens(array $list) { if (!$list) { return null; } $list = array_unique($list); return implode(' ', $list); } public function newTokens($query) { $results = $this->tokenizeQuery($query); $tokens = array(); foreach ($results as $result) { $tokens[] = PhutilSearchQueryToken::newFromDictionary($result); } return $tokens; } private function tokenizeQuery($query) { $maximum_bytes = 1024; $query_bytes = strlen($query); if ($query_bytes > $maximum_bytes) { throw new PhutilSearchQueryCompilerSyntaxException( pht( 'Query is too long (%s bytes, maximum is %s bytes).', new PhutilNumber($query_bytes), new PhutilNumber($maximum_bytes))); } $query = phutil_utf8v($query); $length = count($query); $enable_functions = $this->getEnableFunctions(); $mode = 'scan'; $current_operator = array(); $current_token = array(); $current_function = null; $is_quoted = false; $tokens = array(); if ($enable_functions) { $operator_characters = '[~=+-]'; } else { $operator_characters = '[+-]'; } for ($ii = 0; $ii < $length; $ii++) { $character = $query[$ii]; if ($mode == 'scan') { if (preg_match('/^\s\z/u', $character)) { continue; } $mode = 'function'; } if ($mode == 'function') { $mode = 'operator'; if ($enable_functions) { $found = false; for ($jj = $ii; $jj < $length; $jj++) { if (preg_match('/^[a-zA-Z]\z/u', $query[$jj])) { continue; } if ($query[$jj] == ':') { $found = $jj; } break; } if ($found !== false) { $function = array_slice($query, $ii, ($jj - $ii)); $current_function = implode('', $function); if (!strlen($current_function)) { $current_function = null; } $ii = $jj; continue; } } } if ($mode == 'operator') { if (preg_match('/^\s\z/u', $character)) { continue; } if (preg_match('/^'.$operator_characters.'\z/', $character)) { $current_operator[] = $character; continue; } $mode = 'quote'; } if ($mode == 'quote') { if (preg_match('/^"\z/', $character)) { $is_quoted = true; $mode = 'token'; continue; } $mode = 'token'; } if ($mode == 'token') { $capture = false; $was_quoted = $is_quoted; if ($is_quoted) { if (preg_match('/^"\z/', $character)) { $capture = true; $mode = 'scan'; $is_quoted = false; } } else { if (preg_match('/^\s\z/u', $character)) { $capture = true; $mode = 'scan'; } if (preg_match('/^"\z/', $character)) { $capture = true; $mode = 'token'; $is_quoted = true; } } if ($capture) { $token = array( 'operator' => $current_operator, 'quoted' => $was_quoted, 'value' => $current_token, ); if ($enable_functions) { $token['function'] = $current_function; } $tokens[] = $token; $current_operator = array(); $current_token = array(); $current_function = null; continue; } else { $current_token[] = $character; } } } if ($is_quoted) { throw new PhutilSearchQueryCompilerSyntaxException( pht( 'Query contains unmatched double quotes.')); } // If the input query has trailing space, like "a b ", we may exit the // parser without a final token. if ($current_function !== null || $current_operator || $current_token) { $token = array( 'operator' => $current_operator, 'quoted' => false, 'value' => $current_token, ); if ($enable_functions) { $token['function'] = $current_function; } $tokens[] = $token; } $results = array(); foreach ($tokens as $token) { $value = implode('', $token['value']); $operator_string = implode('', $token['operator']); $is_quoted = $token['quoted']; switch ($operator_string) { case '-': $operator = self::OPERATOR_NOT; break; case '~': $operator = self::OPERATOR_SUBSTRING; break; case '=': $operator = self::OPERATOR_EXACT; break; case '+': $operator = self::OPERATOR_AND; break; case '': // See T12995. If this query term contains Chinese, Japanese or // Korean characters, treat the term as a substring term by default. // These languages do not separate words with spaces, so the term // search mode is normally useless. if ($enable_functions && !$is_quoted && phutil_utf8_is_cjk($value)) { $operator = self::OPERATOR_SUBSTRING; } else { $operator = self::OPERATOR_AND; } break; default: throw new PhutilSearchQueryCompilerSyntaxException( pht( 'Query has an invalid sequence of operators ("%s").', $operator_string)); } if (!strlen($value)) { $require_value = $is_quoted; switch ($operator) { + case self::OPERATOR_NOT: + if ($enable_functions && ($token['function'] !== null)) { + $operator = self::OPERATOR_ABSENT; + $value = null; + } else { + $require_value = true; + } + break; + case self::OPERATOR_SUBSTRING: + if ($enable_functions && ($token['function'] !== null)) { + $operator = self::OPERATOR_PRESENT; + $value = null; + } else { + $require_value = true; + } + break; default: $require_value = true; break; } if ($require_value) { throw new PhutilSearchQueryCompilerSyntaxException( pht( 'Query contains a token ("%s") with no search term. Query '. 'tokens specify text to search for.', $this->getDisplayToken($token))); } } $result = array( 'operator' => $operator, 'quoted' => $is_quoted, 'value' => $value, ); if ($enable_functions) { $result['function'] = $token['function']; } $results[] = $result; } return $results; } private function renderToken( PhutilSearchQueryToken $token, PhutilSearchStemmer $stemmer = null) { $value = $token->getValue(); if ($stemmer) { $value = $stemmer->stemToken($value); } $value = $this->quoteToken($value); $operator = $token->getOperator(); $prefix = $this->getOperatorPrefix($operator); $value = $prefix.$value; return $value; } private function getOperatorPrefix($operator) { $operators = $this->operators; switch ($operator) { case self::OPERATOR_AND: $prefix = $operators[0]; break; case self::OPERATOR_NOT: $prefix = $operators[2]; break; default: throw new PhutilSearchQueryCompilerSyntaxException( pht( 'Unsupported operator prefix "%s".', $operator)); } if ($prefix == ' ') { $prefix = null; } return $prefix; } private function quoteToken($value) { $operators = $this->operators; $open_quote = $this->operators[10]; $close_quote = $this->operators[11]; return $open_quote.$value.$close_quote; } private function getDisplayToken(array $token) { if (isset($token['function'])) { $function = $token['function'].':'; } else { $function = ''; } $operator_string = implode('', $token['operator']); $value = implode('', $token['value']); $is_quoted = $token['quoted']; if ($is_quoted) { $value = $this->quoteToken($value); } return sprintf('%s%s%s', $function, $operator_string, $value); } } diff --git a/src/applications/search/compiler/__tests__/PhutilSearchQueryCompilerTestCase.php b/src/applications/search/compiler/__tests__/PhutilSearchQueryCompilerTestCase.php index 0369cc9cd3..987b7d354a 100644 --- a/src/applications/search/compiler/__tests__/PhutilSearchQueryCompilerTestCase.php +++ b/src/applications/search/compiler/__tests__/PhutilSearchQueryCompilerTestCase.php @@ -1,233 +1,247 @@ <?php final class PhutilSearchQueryCompilerTestCase extends PhutilTestCase { public function testCompileQueries() { $tests = array( '' => null, 'cat dog' => '+"cat" +"dog"', 'cat -dog' => '+"cat" -"dog"', 'cat-dog' => '+"cat-dog"', // If there are spaces after an operator, the operator applies to the // next search term. 'cat - dog' => '+"cat" -"dog"', // Double quotes serve as delimiters even if there is no whitespace // between terms. '"cat"dog' => '+"cat" +"dog"', // This query is too long. str_repeat('x', 2048) => false, // Multiple operators are not permitted. '++cat' => false, '+-cat' => false, '--cat' => false, // Stray operators are not permitted. '+' => false, 'cat +' => false, // Double quotes must be paired. '"' => false, 'cat "' => false, '"cat' => false, 'A"' => false, 'A"B"' => '+"A" +"B"', // Trailing whitespace should be discarded. 'a b ' => '+"a" +"b"', // Functions must have search text. '""' => false, '-' => false, ); $this->assertCompileQueries($tests); // Test that we compile queries correctly if the operators have been // swapped to use "AND" by default. $operator_tests = array( 'cat dog' => '"cat" "dog"', 'cat -dog' => '"cat" -"dog"', ); $this->assertCompileQueries($operator_tests, ' |-><()~*:""&\''); // Test that we compile queries correctly if the quote operators have // been swapped to differ. $quote_tests = array( 'cat dog' => '+[cat] +[dog]', 'cat -dog' => '+[cat] -[dog]', ); $this->assertCompileQueries($quote_tests, '+ -><()~*:[]&|'); } public function testCompileQueriesWithStemming() { $stemming_tests = array( 'cat dog' => array( null, '+"cat" +"dog"', ), 'cats dogs' => array( null, '+"cat" +"dog"', ), 'cats "dogs"' => array( '+"dogs"', '+"cat"', ), '"blessed blade" of the windseeker' => array( '+"blessed blade"', '+"of" +"the" +"windseek"', ), 'mailing users for mentions on tasks' => array( null, '+"mail" +"user" +"for" +"mention" +"on" +"task"', ), ); $stemmer = new PhutilSearchStemmer(); $this->assertCompileQueries($stemming_tests, null, $stemmer); } public function testCompileQueriesWithFunctions() { $op_and = PhutilSearchQueryCompiler::OPERATOR_AND; $op_sub = PhutilSearchQueryCompiler::OPERATOR_SUBSTRING; $op_exact = PhutilSearchQueryCompiler::OPERATOR_EXACT; + $op_present = PhutilSearchQueryCompiler::OPERATOR_PRESENT; + $op_absent = PhutilSearchQueryCompiler::OPERATOR_ABSENT; $mao = "\xE7\x8C\xAB"; $function_tests = array( 'cat' => array( array(null, $op_and, 'cat'), ), ':cat' => array( array(null, $op_and, 'cat'), ), 'title:cat' => array( array('title', $op_and, 'cat'), ), 'title:cat:dog' => array( array('title', $op_and, 'cat:dog'), ), 'title:~cat' => array( array('title', $op_sub, 'cat'), ), 'cat title:="Meow Meow"' => array( array(null, $op_and, 'cat'), array('title', $op_exact, 'Meow Meow'), ), 'title:cat title:dog' => array( array('title', $op_and, 'cat'), array('title', $op_and, 'dog'), ), '~"core and seven years ag"' => array( array(null, $op_sub, 'core and seven years ag'), ), $mao => array( array(null, $op_sub, $mao), ), '+'.$mao => array( array(null, $op_and, $mao), ), '~'.$mao => array( array(null, $op_sub, $mao), ), '"'.$mao.'"' => array( array(null, $op_and, $mao), ), 'title:' => false, 'title:+' => false, 'title:+""' => false, + 'title:""' => false, + + 'title:~' => array( + array('title', $op_present, null), + ), + + 'title:-' => array( + array('title', $op_absent, null), + ), + + '~' => false, + '-' => false, ); $this->assertCompileFunctionQueries($function_tests); } private function assertCompileQueries( array $tests, $operators = null, PhutilSearchStemmer $stemmer = null) { foreach ($tests as $input => $expect) { $caught = null; $query = null; $literal_query = null; $stemmed_query = null; try { $compiler = new PhutilSearchQueryCompiler(); if ($operators !== null) { $compiler->setOperators($operators); } if ($stemmer !== null) { $compiler->setStemmer($stemmer); } $tokens = $compiler->newTokens($input); if ($stemmer) { $literal_query = $compiler->compileLiteralQuery($tokens); $stemmed_query = $compiler->compileStemmedQuery($tokens); } else { $query = $compiler->compileQuery($tokens); } } catch (PhutilSearchQueryCompilerSyntaxException $ex) { $caught = $ex; } if ($caught !== null) { $query = false; $literal_query = false; $stemmed_query = false; } if (!$stemmer) { $this->assertEqual( $expect, $query, pht('Compilation of query: %s', $input)); } else { $this->assertEqual( $expect, ($literal_query === false) ? false : array($literal_query, $stemmed_query), pht('Stemmed compilation of query: %s', $input)); } } } private function assertCompileFunctionQueries(array $tests) { foreach ($tests as $input => $expect) { $compiler = id(new PhutilSearchQueryCompiler()) ->setEnableFunctions(true); try { $tokens = $compiler->newTokens($input); $result = array(); foreach ($tokens as $token) { $result[] = array( $token->getFunction(), $token->getOperator(), $token->getValue(), ); } } catch (PhutilSearchQueryCompilerSyntaxException $ex) { $result = false; } $this->assertEqual( $expect, $result, pht('Function compilation of query: %s', $input)); } } }