includes/search.php
changeset 1201 9593e62929d1
parent 1194 70169f572190
child 1227 bdac73ed481e
equal deleted inserted replaced
1200:0f94802001ee 1201:9593e62929d1
   184         $term = strtolower($term);
   184         $term = strtolower($term);
   185       $where_any[] = $term;
   185       $where_any[] = $term;
   186     }
   186     }
   187 
   187 
   188     $col_word = ( $case_sensitive ) ? 'word' : 'word_lcase';
   188     $col_word = ( $case_sensitive ) ? 'word' : 'word_lcase';
   189     $where_any = ( count($where_any) > 0 ) ? '( ' . $col_word . ' LIKE \'%' . implode('%\' OR ' . $col_word . ' LIKE \'%', $where_any) . '%\' )' : '';
   189     $where_any_str = ( count($where_any) > 0 ) ? '( ' . $col_word . ' LIKE \'%' . implode('%\' OR ' . $col_word . ' LIKE \'%', $where_any) . '%\' )' : '';
   190 
   190 
   191     // generate query
   191     // generate query
   192     $sql = "SELECT word, page_names FROM " . table_prefix . "search_index WHERE {$where_any}";
   192     $sql = "SELECT word, page_names FROM " . table_prefix . "search_index WHERE {$where_any_str}";
   193     if ( !($q = $db->sql_query($sql)) )
   193     if ( !($q = $db->sql_query($sql)) )
   194       $db->_die('Error is in perform_search(), includes/search.php, query 1');
   194       $db->_die('Error is in perform_search(), includes/search.php, query 1');
   195 
   195 
   196     $word_tracking = array();
   196     $word_tracking = array();
   197     if ( $row = $db->fetchrow($q) )
   197     if ( $row = $db->fetchrow($q) )
   198     {
   198     {
   199       do
   199       do
   200       {
   200       {
   201         // get page list
   201         // get page list
   202         $pages =& $row['page_names'];
   202         $pages =& $row['page_names'];
   203         if ( strpos($pages, ',') )
   203           
   204         {
   204         // Find page IDs that contain commas
   205           // the term occurs in more than one page
   205         // This should never happen because commas are escaped by sanitize_page_id(). Nevertheless for compatibility with older
   206 
   206         // databases, and to alleviate the concerns of hackers, we'll accommodate for page IDs with commas here by checking for
   207           // Find page IDs that contain commas
   207         // IDs that don't match the pattern for stringified page ID + namespace. If it doesn't match, that means it's a continuation
   208           // This should never happen because commas are escaped by sanitize_page_id(). Nevertheless for compatibility with older
   208         // of the previous ID and should be concatenated to the previous entry.
   209           // databases, and to alleviate the concerns of hackers, we'll accommodate for page IDs with commas here by checking for
   209         $matches = strpos($pages, ',') ? explode(',', $pages) : array($pages);
   210           // IDs that don't match the pattern for stringified page ID + namespace. If it doesn't match, that means it's a continuation
   210         $prev = false;
   211           // of the previous ID and should be concatenated to the previous entry.
   211         foreach ( $matches as $i => $_ )
   212           $matches = explode(',', $pages);
   212         {
   213           $prev = false;
   213           $match =& $matches[$i];
   214           foreach ( $matches as $i => $_ )
   214           if ( !preg_match("/^ns=$ns_list;pid=(.+)$/", $match) && $prev )
   215           {
   215           {
   216             $match =& $matches[$i];
   216             $matches[$prev] .= ',' . $match;
   217             if ( !preg_match("/^ns=$ns_list;pid=(.+)$/", $match) && $prev )
   217             unset($match, $matches[$i]);
   218             {
   218             continue;
   219               $matches[$prev] .= ',' . $match;
   219           }
   220               unset($match, $matches[$i]);
   220           $prev = $i;
   221               continue;
   221         }
   222             }
   222         unset($match);
   223             $prev = $i;
   223 
   224           }
   224         // Iterate through each of the results, assigning scores based on how many times the page has shown up.
   225           unset($match);
   225         // This works because this phase of the search is strongly word-based not page-based. If a page shows up
   226 
   226         // multiple times while fetching the result rows from the search_index table, it simply means that page
   227           // Iterate through each of the results, assigning scores based on how many times the page has shown up.
   227         // contains more than one of the terms the user searched for.
   228           // This works because this phase of the search is strongly word-based not page-based. If a page shows up
   228 
   229           // multiple times while fetching the result rows from the search_index table, it simply means that page
   229         foreach ( $matches as $match )
   230           // contains more than one of the terms the user searched for.
   230         {
   231 
   231           $word_cs = (( $case_sensitive ) ? $row['word'] : strtolower($row['word']));
   232           foreach ( $matches as $match )
   232           if ( isset($word_tracking[$match]) && in_array($word_cs, $word_tracking[$match]) )
   233           {
   233           {
   234             $word_cs = (( $case_sensitive ) ? $row['word'] : strtolower($row['word']));
   234             continue;
   235             if ( isset($word_tracking[$match]) && in_array($word_cs, $word_tracking[$match]) )
   235           }
   236             {
   236           if ( isset($word_tracking[$match]) )
   237               continue;
   237           {
   238             }
       
   239             if ( isset($word_tracking[$match]) )
   238             if ( isset($word_tracking[$match]) )
   240             {
   239             {
   241               if ( isset($word_tracking[$match]) )
   240               $word_tracking[$match][] = $word_cs;
   242               {
       
   243                 $word_tracking[$match][] = ($word_cs);
       
   244               }
       
   245             }
   241             }
   246             else
   242           }
   247             {
   243           else
   248               $word_tracking[$match] = array($word_cs);
   244           {
   249             }
   245             $word_tracking[$match] = array($word_cs);
   250             $inc = 1;
   246           }
   251 
       
   252             // Is this search term present in the page's title? If so, give extra points
       
   253             preg_match("/^ns=$ns_list;pid=(.+)$/", $pages, $piecesparts);
       
   254             $title = get_page_title_ns($piecesparts[2], $piecesparts[1]);
       
   255             
       
   256             $test_func = ( $case_sensitive ) ? 'strstr' : 'stristr';
       
   257             if ( $test_func($title, $row['word']) || $test_func($piecesparts[2], $row['word']) )
       
   258             {
       
   259               $inc = 1.5;
       
   260             }
       
   261           
   247           
   262             if ( isset($scores[$match]) )
   248           // echo '<pre>' . print_r($word_tracking, true) . '</pre>';
   263             {
       
   264               $scores[$match] = $scores[$match] + $inc;
       
   265             }
       
   266             else
       
   267             {
       
   268               $scores[$match] = $inc;
       
   269             }
       
   270           }
       
   271         }
       
   272         else
       
   273         {
       
   274           // the term only occurs in one page
       
   275           $word_cs = (( $case_sensitive ) ? $row['word'] : strtolower($row['word']));
       
   276           
   249           
   277           if ( isset($word_tracking[$pages]) && in_array($word_cs, $word_tracking[$pages]) )
       
   278           {
       
   279             continue;
       
   280           }
       
   281           if ( isset($word_tracking[$pages]) )
       
   282           {
       
   283             if ( isset($word_tracking[$pages]) )
       
   284             {
       
   285               $word_tracking[$pages][] = ($word_cs);
       
   286             }
       
   287           }
       
   288           else
       
   289           {
       
   290             $word_tracking[$pages] = array($word_cs);
       
   291           }
       
   292           $inc = 1;
   250           $inc = 1;
   293 
   251 
   294           // Is this search term present in the page's title? If so, give extra points
   252           // Is this search term present in the page's title? If so, give extra points
   295           preg_match("/^ns=$ns_list;pid=(.+)$/", $pages, $piecesparts);
   253           preg_match("/^ns=$ns_list;pid=(.+)$/", $match, $piecesparts);
   296           $title = get_page_title_ns($piecesparts[2], $piecesparts[1]);
   254           $title = get_page_title_ns($piecesparts[2], $piecesparts[1]);
   297           
   255           
   298           $test_func = ( $case_sensitive ) ? 'strstr' : 'stristr';
   256           $test_func = ( $case_sensitive ) ? 'strstr' : 'stristr';
   299           if ( $test_func($title, $row['word']) || $test_func($piecesparts[2], $row['word']) )
   257           if ( $test_func($title, $row['word']) || $test_func($piecesparts[2], $row['word']) )
   300           {
   258           {
   301             $inc = 1.5;
   259             $inc = 1.5;
   302           }
   260           }
   303           
   261           
   304           if ( isset($scores[$pages]) )
   262           // increase points if 2 or more words match a phrase in the title
   305           {
   263           for ( $i = 0; $i < count($where_any) - 1; $i++ )
   306             $scores[$pages] = $scores[$pages] + $inc;
   264           {
       
   265             $phrase = "{$where_any[$i]} {$where_any[$i + 1]}";
       
   266             if ( $test_func($title, $phrase) )
       
   267             {
       
   268               $inc *= 1.25;
       
   269             }
       
   270           }
       
   271           
       
   272           // Deduct points if there are few similarities between the words
       
   273           $lev_array = array();
       
   274           foreach ( $where_any as $qword )
       
   275           {
       
   276             if ( strstr($word_cs, $qword) )
       
   277               $lev_array[ $qword ] = levenshtein($qword, $word_cs);
       
   278           }
       
   279           if ( min($lev_array) > 3 )
       
   280           {
       
   281             $inc /= array_sum($lev_array) / count($lev_array);
       
   282           }
       
   283           
       
   284           if ( isset($scores[$match]) )
       
   285           {
       
   286             $scores[$match] = $scores[$match] + $inc;
   307           }
   287           }
   308           else
   288           else
   309           {
   289           {
   310             $scores[$pages] = $inc;
   290             $scores[$match] = $inc;
   311           }
   291           }
   312         }
   292         }
   313       }
   293       }
   314       while ( $row = $db->fetchrow($q) );
   294       while ( $row = $db->fetchrow($q) );
   315     }
   295     }
   316     $db->free_result($q);
   296     $db->free_result($q);
   317 
   297     
   318     //
   298     //
   319     // STAGE 2: FIRST ELIMINATION ROUND
   299     // STAGE 2: FIRST ELIMINATION ROUND
   320     // Iterate through the list of required terms. If a given page is not found to have the required term, eliminate it
   300     // Iterate through the list of required terms. If a given page is not found to have the required term, eliminate it
   321     //
   301     //
   322 
   302 
   398         {
   378         {
   399           if ( $test_func($title, $word) )
   379           if ( $test_func($title, $word) )
   400             $inc += 1.5;
   380             $inc += 1.5;
   401           else if ( $test_func($row['page_text'], $word) )
   381           else if ( $test_func($row['page_text'], $word) )
   402             $inc += 1.0;
   382             $inc += 1.0;
       
   383         }
       
   384         
       
   385         // increase points if 2 or more words match a phrase in the title
       
   386         for ( $i = 0; $i < count($word_list) - 1; $i++ )
       
   387         {
       
   388           $phrase = "{$word_list[$i]} {$word_list[$i + 1]}";
       
   389           if ( $test_func($title, $phrase) )
       
   390             $inc *= 1.25;
       
   391           else if ( $test_func($row['page_text'], $phrase) )
       
   392             $inc *= 1.125;
   403         }
   393         }
   404         
   394         
   405         if ( isset($scores[$id]) )
   395         if ( isset($scores[$id]) )
   406         {
   396         {
   407           $scores[$id] = $scores[$id] + $inc;
   397           $scores[$id] = $scores[$id] + $inc;
   557   // Sort scores array
   547   // Sort scores array
   558   arsort($scores);
   548   arsort($scores);
   559 
   549 
   560   // Divisor for calculating relevance scores
   550   // Divisor for calculating relevance scores
   561   $divisor = ( count($query['any']) + count($query_phrase['any']) + count($query['req']) + count($query['not']) ) * 1.5;
   551   $divisor = ( count($query['any']) + count($query_phrase['any']) + count($query['req']) + count($query['not']) ) * 1.5;
       
   552   $divisor = max($divisor, max($scores));
   562   
   553   
   563   foreach ( $scores as $page_id => $score )
   554   foreach ( $scores as $page_id => $score )
   564   {
   555   {
   565     if ( !isset($page_data[$page_id]) )
   556     if ( !isset($page_data[$page_id]) )
   566       // It's possible that $scores contains a score for a page that was later eliminated because it contained a disallowed term
   557       // It's possible that $scores contains a score for a page that was later eliminated because it contained a disallowed term