Making WordPress.org


Ignore:
Timestamp:
04/27/2017 06:11:16 PM (7 years ago)
Author:
iandunn
Message:

Events: Refactor the ideographic fallback for use with ASCII queries too

This was originally intended for ideographic languages, but there are new edge cases where it is helpful for ASCII queries as well.

See https://github.com/coreymckrill/nearby-wordpress-events/issues/37

File:
1 edited

Legend:

Unmodified
Added
Removed
  • sites/trunk/api.wordpress.org/public_html/events/1.0/index.php

    r5409 r5417  
    212212
    213213    if ( ! is_a( $row, 'stdClass' ) && 'ASCII' !== mb_detect_encoding( $location_name ) ) {
    214         $row = guess_ideographic_location_from_geonames( $location_name, $country, $timezone );
     214        $row = guess_location_from_geonames_fallback( $location_name, $country, $timezone, 'exact', 'ideographic' );
    215215    }
    216216
     
    219219
    220220/**
    221  * Look for the given ideographic location in the Geonames database
     221 * Look for the given location in the Geonames database using a LIKE query
    222222 *
    223223 * This is a fallback for situations where the full-text search in `guess_location_from_geonames()` resulted
    224  * in a false-negative. MySQL < 5.7.6 doesn't support full-text searches on ideographic languages, because
    225  * it cannot determine where the word boundaries are.
    226  *
     224 * in a false-negative.
     225 *
     226 * One situation where this happens is with queries in ideographic languages: MySQL < 5.7.6 doesn't
     227 * support full-text searches for them, because it can't determine where the word boundaries are.
    227228 * See https://dev.mysql.com/doc/refman/5.7/en/fulltext-restrictions.html
     229 *
     230 * There are also edge cases where the exact query doesn't exist in the database, but a loose LIKE query will find
     231 * a similar alternate, like `Osakashi`.
    228232 *
    229233 * @param string $location_name
     
    232236 * @param string $mode          'exact' to only return exact matches from the database;
    233237 *                              'loose' to return any match. This has a high chance of false positives.
     238 * @param string $restrict_counties 'ideographic' to only search in countries where ideographic languages are common;
     239 *                                  'none' to search all countries
    234240 *
    235241 * @return stdClass|null
    236242 */
    237 function guess_ideographic_location_from_geonames( $location_name, $country, $timezone, $mode = 'exact' ) {
     243function guess_location_from_geonames_fallback( $location_name, $country, $timezone, $mode = 'exact', $restrict_counties = 'ideographic' ) {
    238244    global $wpdb;
    239245
    240     $ideographic_countries            = get_ideographic_counties();
    241     $ideographic_country_placeholders = get_prepare_placeholders( count( $ideographic_countries ), '%s' );
     246    $where = $ideographic_countries = $ideographic_country_placeholders = '';
    242247
    243248    /*
     
    256261    );
    257262
     263    $prepare_args = array( $escaped_location_name, $country, $timezone );
     264
     265    if ( 'ideographic' == $restrict_counties ) {
     266        $ideographic_countries            = get_ideographic_counties();
     267        $ideographic_country_placeholders = get_prepare_placeholders( count( $ideographic_countries ), '%s' );
     268
     269        $where .= "country IN ( $ideographic_country_placeholders ) AND";
     270
     271        $prepare_args = array_merge( $ideographic_countries, $prepare_args );
     272    }
     273
    258274    /*
    259275     * REPLACE() is used because sometimes the `alternatenames` column contains entries where the `asciiname` is
     
    270286        FROM `geoname`
    271287        WHERE
    272             country IN ( $ideographic_country_placeholders ) AND
     288            $where
    273289            REPLACE( alternatenames, CONCAT( asciiname, ' - ' ), '' ) LIKE %s
    274290        ORDER BY
     
    278294        LIMIT 1";
    279295
    280     $prepared_query = $wpdb->prepare(
    281         $query,
    282         array_merge( $ideographic_countries, array( $escaped_location_name, $country, $timezone ) )
    283     );
     296    $prepared_query = $wpdb->prepare( $query, $prepare_args );
    284297
    285298    return $wpdb->get_row( $prepared_query );
     
    444457
    445458    /*
    446      * If all else fails for a non-ASCII request, cast a wide net and try to find something before giving up, even
     459     * If all else fails, cast a wide net and try to find something before giving up, even
    447460     * if the chance of success is lower than normal. Returning false is guaranteed failure, so this improves things
    448461     * even if it only works 10% of the time.
     
    450463     * This must be done as the very last thing before giving up, because the likelihood of false positives is high.
    451464     */
    452     if ( ! $location && isset( $args['location_name'] ) && 'ASCII' !== mb_detect_encoding( $args['location_name'] ) ) {
    453         $guess = guess_ideographic_location_from_geonames( $args['location_name'], $country_code, $args['timezone'] ?? '', 'loose' );
     465    if ( ! $location && isset( $args['location_name'] ) ) {
     466        if ( 'ASCII' === mb_detect_encoding( $args['location_name'] ) ) {
     467            $guess = guess_location_from_geonames_fallback( $args['location_name'], $country_code, $args['timezone'] ?? '', 'loose', 'none' );
     468        } else {
     469            $guess = guess_location_from_geonames_fallback( $args['location_name'], $country_code, $args['timezone'] ?? '', 'loose', 'ideographic' );
     470        }
    454471
    455472        if ( $guess ) {
Note: See TracChangeset for help on using the changeset viewer.