190 $term = escape_string_like($term); |
191 $term = escape_string_like($term); |
191 if ( !$case_sensitive ) |
192 if ( !$case_sensitive ) |
192 $term = strtolower($term); |
193 $term = strtolower($term); |
193 $where_any[] = $term; |
194 $where_any[] = $term; |
194 } |
195 } |
195 |
196 |
196 $col_word = ( $case_sensitive ) ? 'word' : 'lcase(word)'; |
197 $col_word = ( $case_sensitive ) ? 'word' : 'lcase(word)'; |
197 $where_any = ( count($where_any) > 0 ) ? '( ' . $col_word . ' = \'' . implode('\' OR ' . $col_word . ' = \'', $where_any) . '\' )' : ''; |
198 $where_any = ( count($where_any) > 0 ) ? '( ' . $col_word . ' = \'' . implode('\' OR ' . $col_word . ' = \'', $where_any) . '\' )' : ''; |
198 |
199 |
199 // generate query |
200 // generate query |
200 // using a GROUP BY here ensures that the same word with a different case isn't counted as 2 words - it's all melted back |
201 // using a GROUP BY here ensures that the same word with a different case isn't counted as 2 words - it's all melted back |
201 // into one later in the processing stages |
202 // into one later in the processing stages |
202 $group_by = ( $case_sensitive ) ? '' : ' GROUP BY lcase(word);'; |
203 // $group_by = ( $case_sensitive ) ? '' : ' GROUP BY lcase(word);'; |
203 $sql = "SELECT word, page_names FROM " . table_prefix . "search_index WHERE {$where_any}{$group_by}"; |
204 $sql = "SELECT word, page_names FROM " . table_prefix . "search_index WHERE {$where_any}"; |
204 if ( !($q = $db->sql_unbuffered_query($sql)) ) |
205 if ( !($q = $db->sql_unbuffered_query($sql)) ) |
205 $db->_die('Error is in perform_search(), includes/search.php, query 1'); |
206 $db->_die('Error is in perform_search(), includes/search.php, query 1'); |
206 |
207 |
207 $word_tracking = array(); |
208 $word_tracking = array(); |
208 if ( $row = $db->fetchrow() ) |
209 if ( $row = $db->fetchrow() ) |
209 { |
210 { |
210 do |
211 do |
211 { |
212 { |
212 // get page list |
213 // get page list |
213 $pages =& $row['page_names']; |
214 $pages =& $row['page_names']; |
214 $ns_list = '(' . implode('|', array_keys($paths->nslist)) . ')'; |
|
215 if ( strpos($pages, ',') ) |
215 if ( strpos($pages, ',') ) |
216 { |
216 { |
217 // the term occurs in more than one page |
217 // the term occurs in more than one page |
218 |
218 |
219 // Find page IDs that contain commas |
219 // Find page IDs that contain commas |
220 // This should never happen because commas are escaped by sanitize_page_id(). Nevertheless for compatibility with older |
220 // This should never happen because commas are escaped by sanitize_page_id(). Nevertheless for compatibility with older |
221 // databases, and to alleviate the concerns of hackers, we'll accommodate for page IDs with commas here by checking for |
221 // databases, and to alleviate the concerns of hackers, we'll accommodate for page IDs with commas here by checking for |
222 // IDs that don't match the pattern for stringified page ID + namespace. If it doesn't match, that means it's a continuation |
222 // IDs that don't match the pattern for stringified page ID + namespace. If it doesn't match, that means it's a continuation |
223 // of the previous ID and should be concatenated to the previous entry. |
223 // of the previous ID and should be concatenated to the previous entry. |
233 continue; |
233 continue; |
234 } |
234 } |
235 $prev = $i; |
235 $prev = $i; |
236 } |
236 } |
237 unset($match); |
237 unset($match); |
238 |
238 |
239 // Iterate through each of the results, assigning scores based on how many times the page has shown up. |
239 // Iterate through each of the results, assigning scores based on how many times the page has shown up. |
240 // This works because this phase of the search is strongly word-based not page-based. If a page shows up |
240 // This works because this phase of the search is strongly word-based not page-based. If a page shows up |
241 // multiple times while fetching the result rows from the search_index table, it simply means that page |
241 // multiple times while fetching the result rows from the search_index table, it simply means that page |
242 // contains more than one of the terms the user searched for. |
242 // contains more than one of the terms the user searched for. |
243 |
243 |
244 foreach ( $matches as $match ) |
244 foreach ( $matches as $match ) |
245 { |
245 { |
|
246 $word_cs = (( $case_sensitive ) ? $row['word'] : strtolower($row['word'])); |
|
247 if ( isset($word_tracking[$match]) && in_array($word_cs, $word_tracking[$match]) ) |
|
248 { |
|
249 continue; |
|
250 } |
|
251 if ( isset($word_tracking[$match]) ) |
|
252 { |
|
253 if ( isset($word_tracking[$match]) ) |
|
254 { |
|
255 $word_tracking[$match][] = ($word_cs); |
|
256 } |
|
257 } |
|
258 else |
|
259 { |
|
260 $word_tracking[$match] = array($word_cs); |
|
261 } |
|
262 $inc = 1; |
|
263 |
|
264 // Is this search term present in the page's title? If so, give extra points |
|
265 preg_match("/^ns=$ns_list;pid=(.+)$/", $match, $piecesparts); |
|
266 $pathskey = $paths->nslist[ $piecesparts[1] ] . sanitize_page_id($piecesparts[2]); |
|
267 if ( isset($paths->pages[$pathskey]) ) |
|
268 { |
|
269 $test_func = ( $case_sensitive ) ? 'strstr' : 'stristr'; |
|
270 if ( $test_func($paths->pages[$pathskey]['name'], $row['word']) || $test_func($paths->pages[$pathskey]['urlname_nons'], $row['word']) ) |
|
271 { |
|
272 $inc = 1.5; |
|
273 } |
|
274 } |
246 if ( isset($scores[$match]) ) |
275 if ( isset($scores[$match]) ) |
247 { |
276 { |
248 $scores[$match]++; |
277 $scores[$match] = $scores[$match] + $inc; |
249 } |
278 } |
250 else |
279 else |
251 { |
280 { |
252 $scores[$match] = 1; |
281 $scores[$match] = $inc; |
253 } |
|
254 if ( isset($word_tracking[$match]) ) |
|
255 { |
|
256 $word_tracking[$match][] = $row['word']; |
|
257 } |
|
258 else |
|
259 { |
|
260 $word_tracking[$match] = array($row['word']); |
|
261 } |
282 } |
262 } |
283 } |
263 } |
284 } |
264 else |
285 else |
265 { |
286 { |
266 // the term only occurs in one page |
287 // the term only occurs in one page |
|
288 $word_cs = (( $case_sensitive ) ? $row['word'] : strtolower($row['word'])); |
|
289 if ( isset($word_tracking[$pages]) && in_array($word_cs, $word_tracking[$pages]) ) |
|
290 { |
|
291 continue; |
|
292 } |
|
293 if ( isset($word_tracking[$pages]) ) |
|
294 { |
|
295 if ( isset($word_tracking[$pages]) ) |
|
296 { |
|
297 $word_tracking[$pages][] = ($word_cs); |
|
298 } |
|
299 } |
|
300 else |
|
301 { |
|
302 $word_tracking[$pages] = array($word_cs); |
|
303 } |
|
304 $inc = 1; |
|
305 |
|
306 // Is this search term present in the page's title? If so, give extra points |
|
307 preg_match("/^ns=$ns_list;pid=(.+)$/", $pages, $piecesparts); |
|
308 $pathskey = $paths->nslist[ $piecesparts[1] ] . sanitize_page_id($piecesparts[2]); |
|
309 if ( isset($paths->pages[$pathskey]) ) |
|
310 { |
|
311 $test_func = ( $case_sensitive ) ? 'strstr' : 'stristr'; |
|
312 if ( $test_func($paths->pages[$pathskey]['name'], $row['word']) || $test_func($paths->pages[$pathskey]['urlname_nons'], $row['word']) ) |
|
313 { |
|
314 $inc = 1.5; |
|
315 } |
|
316 } |
267 if ( isset($scores[$pages]) ) |
317 if ( isset($scores[$pages]) ) |
268 { |
318 { |
269 $scores[$pages]++; |
319 $scores[$pages] = $scores[$pages] + $inc; |
270 } |
320 } |
271 else |
321 else |
272 { |
322 { |
273 $scores[$pages] = 1; |
323 $scores[$pages] = $inc; |
274 } |
|
275 if ( isset($word_tracking[$pages]) ) |
|
276 { |
|
277 $word_tracking[$pages][] = $row['word']; |
|
278 } |
|
279 else |
|
280 { |
|
281 $word_tracking[$pages] = array($row['word']); |
|
282 } |
324 } |
283 } |
325 } |
284 } |
326 } |
285 while ( $row = $db->fetchrow() ); |
327 while ( $row = $db->fetchrow() ); |
286 } |
328 } |
287 $db->free_result(); |
329 $db->free_result(); |
288 |
330 |
289 // |
331 // |
290 // STAGE 2: FIRST ELIMINATION ROUND |
332 // STAGE 2: FIRST ELIMINATION ROUND |
291 // Iterate through the list of required terms. If a given page is not found to have the required term, eliminate it |
333 // Iterate through the list of required terms. If a given page is not found to have the required term, eliminate it |
292 // |
334 // |
293 |
335 |
294 foreach ( $query['req'] as $term ) |
336 foreach ( $query['req'] as $term ) |
295 { |
337 { |
296 foreach ( $word_tracking as $i => $page ) |
338 foreach ( $word_tracking as $i => $page ) |
297 { |
339 { |
298 if ( !in_array($term, $page) ) |
340 if ( !in_array($term, $page) ) |
300 unset($word_tracking[$i], $scores[$i]); |
342 unset($word_tracking[$i], $scores[$i]); |
301 } |
343 } |
302 } |
344 } |
303 } |
345 } |
304 } |
346 } |
305 |
347 |
306 // |
348 // |
307 // STAGE 3: PHRASE SEARCHING |
349 // STAGE 3: PHRASE SEARCHING |
308 // Use LIKE to find pages with specified phrases. We can do a super-picky single query without another elimination round because |
350 // Use LIKE to find pages with specified phrases. We can do a super-picky single query without another elimination round because |
309 // at this stage we can search the full page_text column instead of relying on a word list. |
351 // at this stage we can search the full page_text column instead of relying on a word list. |
310 // |
352 // |
311 |
353 |
312 // We can skip this stage if none of these special terms apply |
354 // We can skip this stage if none of these special terms apply |
313 |
355 |
314 $text_col = ( $case_sensitive ) ? 'page_text' : 'lcase(page_text)'; |
356 $text_col = ( $case_sensitive ) ? 'page_text' : 'lcase(page_text)'; |
315 |
357 $name_col = ( $case_sensitive ) ? 'name' : 'lcase(name)'; |
|
358 $text_col_join = ( $case_sensitive ) ? 't.page_text' : 'lcase(t.page_text)'; |
|
359 $name_col_join = ( $case_sensitive ) ? 'p.name' : 'lcase(p.name)'; |
|
360 |
316 if ( count($query_phrase['any']) > 0 || count($query_phrase['req']) > 0 ) |
361 if ( count($query_phrase['any']) > 0 || count($query_phrase['req']) > 0 ) |
317 { |
362 { |
318 |
363 |
319 $where_any = array(); |
364 $where_any = array(); |
320 foreach ( $query_phrase['any'] as $term ) |
365 foreach ( $query_phrase['any'] as $term ) |
321 { |
366 { |
322 $term = escape_string_like($term); |
367 $term = escape_string_like($term); |
323 if ( !$case_sensitive ) |
368 if ( !$case_sensitive ) |
324 $term = strtolower($term); |
369 $term = strtolower($term); |
325 $where_any[] = $term; |
370 $where_any[] = "( $text_col LIKE '%$term%' OR $name_col LIKE '%$term%' )"; |
326 } |
371 } |
327 |
372 |
328 $where_any = ( count($where_any) > 0 ) ? "( $text_col LIKE '%" . implode("%' OR $text_col LIKE '%", $where_any) . "%' )" : ''; |
373 $where_any = ( count($where_any) > 0 ) ? implode(" OR\n ", $where_any) : ''; |
329 |
374 |
330 // Also do required columns, but use AND to ensure that all required terms are included |
375 // Also do required terms, but use AND to ensure that all required terms are included |
331 $where_req = array(); |
376 $where_req = array(); |
332 foreach ( $query_phrase['req'] as $term ) |
377 foreach ( $query_phrase['req'] as $term ) |
333 { |
378 { |
334 $term = escape_string_like($term); |
379 $term = escape_string_like($term); |
335 if ( !$case_sensitive ) |
380 if ( !$case_sensitive ) |
336 $term = strtolower($term); |
381 $term = strtolower($term); |
337 $where_req[] = $term; |
382 $where_req[] = "( $text_col LIKE '%$term%' OR $name_col LIKE '%$term%' )"; |
338 } |
383 } |
339 $and_clause = ( $where_any != '' ) ? 'AND ' : ''; |
384 $and_clause = ( $where_any != '' ) ? 'AND ' : ''; |
340 $where_req = ( count($where_req) > 0 ) ? "{$and_clause}$text_col LIKE '%" . implode("%' AND $text_col LIKE '%", $where_req) . "%'" : ''; |
385 $where_req = ( count($where_req) > 0 ) ? "{$and_clause}" . implode(" AND\n ", $where_req) : ''; |
341 |
386 |
342 $sql = 'SELECT CONCAT("ns=",namespace,";pid=",page_id) AS id FROM ' . table_prefix . "page_text WHERE $where_any $where_req;"; |
387 $sql = 'SELECT CONCAT("ns=",t.namespace,";pid=",t.page_id) AS id, p.name FROM ' . table_prefix . "page_text AS t\n" |
|
388 . " LEFT JOIN " . table_prefix . "pages AS p\n" |
|
389 . " ON ( p.urlname = t.page_id AND p.namespace = t.namespace )\n" |
|
390 . " WHERE\n $where_any\n $where_req;"; |
343 if ( !($q = $db->sql_unbuffered_query($sql)) ) |
391 if ( !($q = $db->sql_unbuffered_query($sql)) ) |
344 $db->_die('Error is in perform_search(), includes/search.php, query 2. Parsed query dump follows:<pre>(indexable) ' . htmlspecialchars(print_r($query, true)) . '(non-indexable) ' . htmlspecialchars(print_r($query_phrase, true)) . '</pre>'); |
392 $db->_die('Error is in perform_search(), includes/search.php, query 2. Parsed query dump follows:<pre>(indexable) ' . htmlspecialchars(print_r($query, true)) . '(non-indexable) ' . htmlspecialchars(print_r($query_phrase, true)) . '</pre>'); |
345 |
393 |
346 if ( $row = $db->fetchrow() ) |
394 if ( $row = $db->fetchrow() ) |
347 { |
395 { |
348 do |
396 do |
349 { |
397 { |
350 $id =& $row['id']; |
398 $id =& $row['id']; |
|
399 $inc = 1; |
|
400 |
|
401 // Is this search term present in the page's title? If so, give extra points |
|
402 preg_match("/^ns=$ns_list;pid=(.+)$/", $id, $piecesparts); |
|
403 $pathskey = $paths->nslist[ $piecesparts[1] ] . sanitize_page_id($piecesparts[2]); |
|
404 if ( isset($paths->pages[$pathskey]) ) |
|
405 { |
|
406 $test_func = ( $case_sensitive ) ? 'strstr' : 'stristr'; |
|
407 foreach ( array_merge($query_phrase['any'], $query_phrase['req']) as $term ) |
|
408 { |
|
409 if ( $test_func($paths->pages[$pathskey]['name'], $term) || $test_func($paths->pages[$pathskey]['urlname_nons'], $term) ) |
|
410 { |
|
411 $inc = 1.5; |
|
412 break; |
|
413 } |
|
414 } |
|
415 } |
351 if ( isset($scores[$id]) ) |
416 if ( isset($scores[$id]) ) |
352 { |
417 { |
353 $scores[$id]++; |
418 $scores[$id] = $scores[$id] + $inc; |
354 } |
419 } |
355 else |
420 else |
356 { |
421 { |
357 $scores[$id] = 1; |
422 $scores[$id] = $inc; |
358 } |
423 } |
359 } |
424 } |
360 while ( $row = $db->fetchrow() ); |
425 while ( $row = $db->fetchrow() ); |
361 } |
426 } |
362 $db->free_result(); |
427 $db->free_result(); |
363 } |
428 } |
364 |
429 |
365 // |
430 // |
366 // STAGE 4 - SELECT PAGE TEXT AND ELIMINATE NOTS |
431 // STAGE 4 - SELECT PAGE TEXT AND ELIMINATE NOTS |
367 // At this point, we have a complete list of all the possible pages. Now we want to obtain the page text, and within the same query |
432 // At this point, we have a complete list of all the possible pages. Now we want to obtain the page text, and within the same query |
368 // eliminate any terms that shouldn't be in there. |
433 // eliminate any terms that shouldn't be in there. |
369 // |
434 // |
370 |
435 |
371 // Generate master word list for the highlighter |
436 // Generate master word list for the highlighter |
372 $word_list = array_values(array_merge($query['any'], $query['req'], $query_phrase['any'], $query_phrase['req'])); |
437 $word_list = array_values(array_merge($query['any'], $query['req'], $query_phrase['any'], $query_phrase['req'])); |
373 |
438 |
374 $text_where = array(); |
439 $text_where = array(); |
375 foreach ( $scores as $page_id => $_ ) |
440 foreach ( $scores as $page_id => $_ ) |
376 { |
441 { |
377 $text_where[] = $db->escape($page_id); |
442 $text_where[] = $db->escape($page_id); |
378 } |
443 } |
379 $text_where = '( CONCAT("ns=",t.namespace,";pid=",t.page_id) = \'' . implode('\' OR CONCAT("ns=",t.namespace,";pid=",t.page_id) = \'', $text_where) . '\' )'; |
444 $text_where = '( CONCAT("ns=",t.namespace,";pid=",t.page_id) = \'' . implode('\' OR CONCAT("ns=",t.namespace,";pid=",t.page_id) = \'', $text_where) . '\' )'; |
380 |
445 |
381 if ( count($query['not']) > 0 ) |
446 if ( count($query['not']) > 0 ) |
382 $text_where .= ' AND'; |
447 $text_where .= ' AND'; |
383 |
448 |
384 $where_not = array(); |
449 $where_not = array(); |
385 foreach ( $query['not'] as $term ) |
450 foreach ( $query['not'] as $term ) |
386 { |
451 { |
387 $term = escape_string_like($term); |
452 $term = escape_string_like($term); |
388 if ( !$case_sensitive ) |
453 if ( !$case_sensitive ) |
389 $term = strtolower($term); |
454 $term = strtolower($term); |
390 $where_not[] = $term; |
455 $where_not[] = $term; |
391 } |
456 } |
392 $where_not = ( count($where_not) > 0 ) ? "$text_col NOT LIKE '%" . implode("%' AND $text_col NOT LIKE '%", $where_not) . "%'" : ''; |
457 $where_not = ( count($where_not) > 0 ) ? "$text_col NOT LIKE '%" . implode("%' AND $text_col NOT LIKE '%", $where_not) . "%'" : ''; |
393 |
458 |
394 $sql = 'SELECT CONCAT("ns=",t.namespace,";pid=",t.page_id) AS id, t.page_id, t.namespace, CHAR_LENGTH(t.page_text) AS page_length, t.page_text, p.name AS page_name FROM ' . table_prefix . "page_text AS t |
459 $sql = 'SELECT CONCAT("ns=",t.namespace,";pid=",t.page_id) AS id, t.page_id, t.namespace, CHAR_LENGTH(t.page_text) AS page_length, t.page_text, p.name AS page_name FROM ' . table_prefix . "page_text AS t |
395 LEFT JOIN " . table_prefix . "pages AS p |
460 LEFT JOIN " . table_prefix . "pages AS p |
396 ON ( p.urlname = t.page_id AND p.namespace = t.namespace ) |
461 ON ( p.urlname = t.page_id AND p.namespace = t.namespace ) |
397 WHERE $text_where $where_not;"; |
462 WHERE $text_where $where_not;"; |
398 if ( !($q = $db->sql_unbuffered_query($sql)) ) |
463 if ( !($q = $db->sql_unbuffered_query($sql)) ) |
399 $db->_die('Error is in perform_search(), includes/search.php, query 3'); |
464 $db->_die('Error is in perform_search(), includes/search.php, query 3'); |
400 |
465 |
401 $page_data = array(); |
466 $page_data = array(); |
402 if ( $row = $db->fetchrow() ) |
467 if ( $row = $db->fetchrow() ) |
403 { |
468 { |
404 do |
469 do |
405 { |
470 { |
406 $row['page_text'] = htmlspecialchars($row['page_text']); |
471 $row['page_text'] = htmlspecialchars($row['page_text']); |
407 $row['page_name'] = htmlspecialchars($row['page_name']); |
472 $row['page_name'] = htmlspecialchars($row['page_name']); |
408 |
473 |
409 // Highlight results (this is wonderfully automated) |
474 // Highlight results (this is wonderfully automated) |
410 $row['page_text'] = highlight_and_clip_search_result($row['page_text'], $word_list, $case_sensitive); |
475 $row['page_text'] = highlight_and_clip_search_result($row['page_text'], $word_list, $case_sensitive); |
411 if ( strlen($row['page_text']) > 250 && !preg_match('/^\.\.\.(.+)\.\.\.$/', $row['page_text']) ) |
476 if ( strlen($row['page_text']) > 250 && !preg_match('/^\.\.\.(.+)\.\.\.$/', $row['page_text']) ) |
412 { |
477 { |
413 $row['page_text'] = substr($row['page_text'], 0, 150) . '...'; |
478 $row['page_text'] = substr($row['page_text'], 0, 150) . '...'; |
414 } |
479 } |
415 $row['page_name'] = highlight_search_result($row['page_name'], $word_list, $case_sensitive); |
480 $row['page_name'] = highlight_search_result($row['page_name'], $word_list, $case_sensitive); |
416 |
481 |
417 $page_data[$row['id']] = $row; |
482 $page_data[$row['id']] = $row; |
418 } |
483 } |
419 while ( $row = $db->fetchrow() ); |
484 while ( $row = $db->fetchrow() ); |
420 } |
485 } |
421 $db->free_result(); |
486 $db->free_result(); |
422 |
487 |
423 // |
488 // |
424 // STAGE 5 - SPECIAL PAGE TITLE SEARCH |
489 // STAGE 5 - SPECIAL PAGE TITLE SEARCH |
425 // Iterate through $paths->pages and check the titles for search terms. Score accordingly. |
490 // Iterate through $paths->pages and check the titles for search terms. Score accordingly. |
426 // |
491 // |
427 |
492 |
428 foreach ( $paths->pages as $page ) |
493 foreach ( $paths->pages as $id => $page ) |
429 { |
494 { |
430 if ( $page['namespace'] != 'Special' ) |
495 if ( $page['namespace'] != 'Special' ) |
431 continue; |
496 continue; |
|
497 if ( !is_int($id) ) |
|
498 continue; |
432 $idstring = 'ns=' . $page['namespace'] . ';pid=' . $page['urlname_nons']; |
499 $idstring = 'ns=' . $page['namespace'] . ';pid=' . $page['urlname_nons']; |
433 $any = array_merge($query['any'], $query_phrase['any']); |
500 $any = array_values(array_unique(array_merge($query['any'], $query_phrase['any']))); |
434 foreach ( $any as $term ) |
501 foreach ( $any as $term ) |
435 { |
502 { |
436 if ( $case_sensitive ) |
503 if ( $case_sensitive ) |
437 { |
504 { |
438 if ( strstr($page['name'], $term) || strstr($page['urlname_nons'], $term) ) |
505 if ( strstr($page['name'], $term) || strstr($page['urlname_nons'], $term) ) |
439 { |
506 { |
440 ( isset($scores[$idstring]) ) ? $scores[$idstring]++ : $scores[$idstring] = 1; |
507 ( isset($scores[$idstring]) ) ? $scores[$idstring] = $scores[$idstring] + 1.5 : $scores[$idstring] = 1.5; |
441 } |
508 } |
442 } |
509 } |
443 else |
510 else |
444 { |
511 { |
445 if ( strstr(strtolower($page['name']), strtolower($term)) || strstr(strtolower($page['urlname_nons']), strtolower($term)) ) |
512 if ( stristr($page['name'], $term) || stristr($page['urlname_nons'], $term) ) |
446 { |
513 { |
447 ( isset($scores[$idstring]) ) ? $scores[$idstring]++ : $scores[$idstring] = 1; |
514 ( isset($scores[$idstring]) ) ? $scores[$idstring] = $scores[$idstring] + 1.5 : $scores[$idstring] = 1.5; |
448 } |
515 } |
449 } |
516 } |
450 } |
517 } |
451 if ( isset($scores[$idstring]) ) |
518 if ( isset($scores[$idstring]) ) |
452 { |
519 { |
476 { |
543 { |
477 unset($page_data[$id]); |
544 unset($page_data[$id]); |
478 } |
545 } |
479 } |
546 } |
480 } |
547 } |
481 |
548 |
482 // At this point, all of our normal results are in. However, we can also allow plugins to hook into the system and score their own |
549 // At this point, all of our normal results are in. However, we can also allow plugins to hook into the system and score their own |
483 // pages and add text, etc. as necessary. |
550 // pages and add text, etc. as necessary. |
484 // Plugins are COMPLETELY responsible for using the search terms and handling Boolean logic properly |
551 // Plugins are COMPLETELY responsible for using the search terms and handling Boolean logic properly |
485 |
552 |
486 $code = $plugins->setHook('search_global_inner'); |
553 $code = $plugins->setHook('search_global_inner'); |
487 foreach ( $code as $cmd ) |
554 foreach ( $code as $cmd ) |
488 { |
555 { |
489 eval($cmd); |
556 eval($cmd); |
490 } |
557 } |
491 |
558 |
492 // a marvelous debugging aid :-) |
559 // a marvelous debugging aid :-) |
493 // die('<pre>' . htmlspecialchars(print_r($page_data, true)) . '</pre>'); |
560 // die('<pre>' . htmlspecialchars(print_r($page_data, true)) . '</pre>'); |
494 |
561 |
495 // |
562 // |
496 // STAGE 7 - HIGHLIGHT, TRIM, AND SCORE RESULTS |
563 // STAGE 7 - HIGHLIGHT, TRIM, AND SCORE RESULTS |
497 // We now have the complete results of the search. We need to trim text down to show only portions of the page containing search |
564 // We now have the complete results of the search. We need to trim text down to show only portions of the page containing search |
498 // terms, highlight any search terms within the page, and sort the final results array in descending order of score. |
565 // terms, highlight any search terms within the page, and sort the final results array in descending order of score. |
499 // |
566 // |
500 |
567 |
501 // Sort scores array |
568 // Sort scores array |
502 arsort($scores); |
569 arsort($scores); |
503 |
570 |
504 // Divisor for calculating relevance scores |
571 // Divisor for calculating relevance scores |
505 $divisor = count($query['any']) + count($query_phrase['any']) + count($query['req']) + count($query_phrase['not']); |
572 $divisor = ( count($query['any']) + count($query_phrase['any']) + count($query['req']) + count($query_phrase['not']) ) * 1.5; |
506 |
573 |
507 foreach ( $scores as $page_id => $score ) |
574 foreach ( $scores as $page_id => $score ) |
508 { |
575 { |
509 if ( !isset($page_data[$page_id]) ) |
576 if ( !isset($page_data[$page_id]) ) |
510 // It's possible that $scores contains a score for a page that was later eliminated because it contained a disallowed term |
577 // It's possible that $scores contains a score for a page that was later eliminated because it contained a disallowed term |
511 continue; |
578 continue; |
512 |
579 |
513 // Make a copy of the datum, then delete the original (it frees up a LOT of RAM) |
580 // Make a copy of the datum, then delete the original (it frees up a LOT of RAM) |
514 $datum = $page_data[$page_id]; |
581 $datum = $page_data[$page_id]; |
515 unset($page_data[$page_id]); |
582 unset($page_data[$page_id]); |
516 |
583 |
517 // This is an internal value used for sorting - it's no longer needed. |
584 // This is an internal value used for sorting - it's no longer needed. |
518 unset($datum['id']); |
585 unset($datum['id']); |
519 |
586 |
520 // Calculate score |
587 // Calculate score |
521 if ( $score > $divisor ) |
588 // if ( $score > $divisor ) |
522 $score = $divisor; |
589 // $score = $divisor; |
523 $datum['score'] = round($score / $divisor, 2) * 100; |
590 $datum['score'] = round($score / $divisor, 2) * 100; |
524 |
591 |
525 // Store it in our until-now-unused results array |
592 // Store it in our until-now-unused results array |
526 $results[] = $datum; |
593 $results[] = $datum; |
527 } |
594 } |
528 |
595 |
529 // Our work here is done. :-D |
596 // Our work here is done. :-D |
530 return $results; |
597 return $results; |
531 } |
598 } |
532 |
599 |
533 /** |
600 /** |
787 $final_chunk = substr($chunk, $j + 1); |
854 $final_chunk = substr($chunk, $j + 1); |
788 break; |
855 break; |
789 } |
856 } |
790 } |
857 } |
791 $mid_chunk = substr($pt, ( $i - 75 ), 75); |
858 $mid_chunk = substr($pt, ( $i - 75 ), 75); |
792 |
859 |
793 $clipped = '...' . $final_chunk . $mid_chunk . $chunk2; |
860 $clipped = '...' . $final_chunk . $mid_chunk . $chunk2; |
794 |
861 |
795 $chunk = substr($pt, ( $i + strlen($chunk2) + 75 )); |
862 $chunk = substr($pt, ( $i + strlen($chunk2) + 75 )); |
796 $final_chunk = $chunk; |
863 $final_chunk = $chunk; |
797 for ( $j = 0; $j < strlen($chunk); $j++ ) |
864 for ( $j = 0; $j < strlen($chunk); $j++ ) |
798 { |
865 { |
799 if ( in_array($chunk{$j}, $space_chars) ) |
866 if ( in_array($chunk{$j}, $space_chars) ) |
800 { |
867 { |
801 $final_chunk = substr($chunk, 0, $j); |
868 $final_chunk = substr($chunk, 0, $j); |
802 break; |
869 break; |
803 } |
870 } |
804 } |
871 } |
805 |
872 |
806 $end_chunk = substr($pt, ( $i + strlen($chunk2) ), 75 ); |
873 $end_chunk = substr($pt, ( $i + strlen($chunk2) ), 75 ); |
807 |
874 |
808 $clipped .= $end_chunk . $final_chunk . '...'; |
875 $clipped .= $end_chunk . $final_chunk . '...'; |
809 |
876 |
810 $pt = $clipped; |
877 $pt = $clipped; |
811 } |
878 } |
812 else if ( strlen($pt) > 200 ) |
879 else if ( strlen($pt) > 200 ) |
813 { |
880 { |
814 $mid_chunk = substr($pt, ( $i - 75 ), 75); |
881 $mid_chunk = substr($pt, ( $i - 75 ), 75); |
815 |
882 |
816 $clipped = $chunk1 . $chunk2; |
883 $clipped = $chunk1 . $chunk2; |
817 |
884 |
818 $chunk = substr($pt, ( $i + strlen($chunk2) + 75 )); |
885 $chunk = substr($pt, ( $i + strlen($chunk2) + 75 )); |
819 $final_chunk = $chunk; |
886 $final_chunk = $chunk; |
820 for ( $j = 0; $j < strlen($chunk); $j++ ) |
887 for ( $j = 0; $j < strlen($chunk); $j++ ) |
821 { |
888 { |
822 if ( in_array($chunk{$j}, $space_chars) ) |
889 if ( in_array($chunk{$j}, $space_chars) ) |
823 { |
890 { |
824 $final_chunk = substr($chunk, 0, $j); |
891 $final_chunk = substr($chunk, 0, $j); |
825 break; |
892 break; |
826 } |
893 } |
827 } |
894 } |
828 |
895 |
829 $end_chunk = substr($pt, ( $i + strlen($chunk2) ), 75 ); |
896 $end_chunk = substr($pt, ( $i + strlen($chunk2) ), 75 ); |
830 |
897 |
831 $clipped .= $end_chunk . $final_chunk . '...'; |
898 $clipped .= $end_chunk . $final_chunk . '...'; |
832 |
899 |
833 $pt = $clipped; |
900 $pt = $clipped; |
834 |
901 |
835 } |
902 } |
836 break 2; |
903 break 2; |
837 } |
904 } |
838 } |
905 } |
839 } |
906 } |
851 function get_stopwords() |
918 function get_stopwords() |
852 { |
919 { |
853 static $stopwords; |
920 static $stopwords; |
854 if ( is_array($stopwords) ) |
921 if ( is_array($stopwords) ) |
855 return $stopwords; |
922 return $stopwords; |
856 |
923 |
857 $stopwords = array('a\'s', 'able', 'after', 'afterwards', 'again', |
924 $stopwords = array('a\'s', 'able', 'after', 'afterwards', 'again', |
858 'against', 'ain\'t', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', |
925 'against', 'ain\'t', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', |
859 'am', 'among', 'amongst', 'an', 'and', 'another', 'any', 'anybody', 'anyhow', 'anyone', 'anything', 'anyway', |
926 'am', 'among', 'amongst', 'an', 'and', 'another', 'any', 'anybody', 'anyhow', 'anyone', 'anything', 'anyway', |
860 'anyways', 'anywhere', 'apart', 'appear', 'appreciate', 'appropriate', 'are', 'aren\'t', 'around', 'as', 'aside', |
927 'anyways', 'anywhere', 'apart', 'appear', 'appreciate', 'appropriate', 'are', 'aren\'t', 'around', 'as', 'aside', |
861 'ask', 'asking', 'associated', 'at', 'available', 'away', 'awfully', 'be', 'became', 'because', 'become', 'becomes', |
928 'ask', 'asking', 'associated', 'at', 'available', 'away', 'awfully', 'be', 'became', 'because', 'become', 'becomes', |
887 'same', 'saw', 'say', 'saying', 'says', 'second', 'secondly', 'see', 'seeing', 'seem', 'seemed', 'seeming', 'seems', |
954 'same', 'saw', 'say', 'saying', 'says', 'second', 'secondly', 'see', 'seeing', 'seem', 'seemed', 'seeming', 'seems', |
888 'seen', 'self', 'selves', 'sensible', 'sent', 'serious', 'seriously', 'seven', 'several', 'shall', 'she', 'should', |
955 'seen', 'self', 'selves', 'sensible', 'sent', 'serious', 'seriously', 'seven', 'several', 'shall', 'she', 'should', |
889 'shouldn\'t', 'since', 'six', 'so', 'some', 'somebody', 'somehow', 'someone', 'something', 'sometime', 'sometimes', |
956 'shouldn\'t', 'since', 'six', 'so', 'some', 'somebody', 'somehow', 'someone', 'something', 'sometime', 'sometimes', |
890 'somewhat', 'somewhere', 'soon', 'sorry', 'specified', 'specify', 'specifying', 'still', 'sub', 'such', 'sup', |
957 'somewhat', 'somewhere', 'soon', 'sorry', 'specified', 'specify', 'specifying', 'still', 'sub', 'such', 'sup', |
891 'sure', 't\'s', 'take', 'taken', 'tell', 'tends', 'th', 'than', 'thank', 'thanks', 'thanx', 'that', 'that\'s', |
958 'sure', 't\'s', 'take', 'taken', 'tell', 'tends', 'th', 'than', 'thank', 'thanks', 'thanx', 'that', 'that\'s', |
892 'thats', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence', 'there', 'there\'s', 'thereafter', |
959 'thats', 'the', 'their', 'theirs', 'them', 'then', 'thence', 'there', 'there\'s', 'thereafter', |
893 'thereby', 'therefore', 'therein', 'theres', 'thereupon', 'these', 'they', 'they\'d', 'they\'ll', 'they\'re', |
960 'thereby', 'therefore', 'therein', 'theres', 'thereupon', 'these', 'they', 'they\'d', 'they\'ll', 'they\'re', |
894 'they\'ve', 'think', 'third', 'this', 'thorough', 'thoroughly', 'those', 'though', 'three', 'through', 'throughout', |
961 'they\'ve', 'think', 'third', 'this', 'thorough', 'thoroughly', 'those', 'though', 'three', 'through', 'throughout', |
895 'thru', 'thus', 'to', 'together', 'too', 'took', 'toward', 'towards', 'tried', 'tries', 'truly', 'try', 'trying', |
962 'thru', 'thus', 'to', 'together', 'too', 'took', 'toward', 'towards', 'tried', 'tries', 'truly', 'try', 'trying', |
896 'twice', 'two', 'un', 'under', 'unfortunately', 'unless', 'unlikely', 'until', 'unto', 'up', 'upon', 'us', 'use', |
963 'twice', 'two', 'un', 'under', 'unfortunately', 'unless', 'unlikely', 'until', 'unto', 'upon', 'use', |
897 'used', 'useful', 'uses', 'using', 'usually', 'value', 'various', 'very', 'via', 'viz', 'vs', 'want', 'wants', |
964 'used', 'useful', 'uses', 'using', 'usually', 'value', 'various', 'very', |
898 'was', 'wasn\'t', 'way', 'we', 'we\'d', 'we\'ll', 'we\'re', 'we\'ve', 'welcome', 'well', 'went', 'were', 'weren\'t', |
965 'was', 'wasn\'t', 'way', 'we', 'we\'d', 'we\'ll', 'we\'re', 'we\'ve', 'welcome', 'well', 'went', 'were', 'weren\'t', |
899 'what', 'what\'s', 'whatever', 'when', 'whence', 'whenever', 'where', 'where\'s', 'whereafter', 'whereas', |
966 'what', 'what\'s', 'whatever', 'when', 'whence', 'whenever', 'where', 'where\'s', 'whereafter', 'whereas', |
900 'whereby', 'wherein', 'whereupon', 'wherever', 'whether', 'which', 'while', 'whither', 'who', 'who\'s', 'whoever', |
967 'which', 'while', 'who', 'who\'s', 'whole', 'whom', 'whose', 'why', 'will', 'willing', 'wish', 'with', 'within', |
901 'whole', 'whom', 'whose', 'why', 'will', 'willing', 'wish', 'with', 'within', 'without', 'won\'t', 'wonder', |
968 'without', 'won\'t', 'wonder', 'would', 'would', 'wouldn\'t', 'yes', 'yet', 'you', 'you\'d', 'you\'ll', 'you\'re', |
902 'would', 'would', 'wouldn\'t', 'yes', 'yet', 'you', 'you\'d', 'you\'ll', 'you\'re', 'you\'ve', 'your', 'yours', |
969 'you\'ve', 'your', 'yours', 'zero'); |
903 'yourself', 'yourselves', 'zero'); |
|
904 return $stopwords; |
970 return $stopwords; |
905 } |
971 } |
906 |
972 |
907 ?> |
973 ?> |