Making WordPress.org

Changeset 5543


Ignore:
Timestamp:
06/06/2017 05:39:24 PM (8 years ago)
Author:
iandunn
Message:

Events: Query new geoname_summary table to improve performance

See #2823
Props dd32

Location:
sites/trunk/api.wordpress.org/public_html/events/1.0
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • sites/trunk/api.wordpress.org/public_html/events/1.0/index.php

    r5541 r5543  
    187187function guess_location_from_city( $location_name, $timezone, $country_code ) {
    188188    $guess = guess_location_from_geonames( $location_name, $timezone, $country_code );
    189     $location_word_count = str_word_count( $location_name );
    190     $location_name_parts = explode( ' ', $location_name );
     189
     190    if ( $guess ) {
     191        return $guess;
     192    }
    191193
    192194    /*
     
    194196     *
    195197     * This won't work for most ideographic languages, because they don't use the space character as a word
    196      * delimiter. That's ok, though, because `guess_ideographic_location_from_geonames()` should cover those
    197      * cases.
     198     * delimiter.
    198199     */
     200    $location_name_parts = preg_split( '/\s+/u', $location_name );
     201    $location_word_count = count( $location_name_parts );
     202
    199203    if ( ! $guess && $location_word_count >= 2 ) {
    200204        // Catch input like "Portland Maine"
    201         $guess = guess_location_from_geonames( $location_name_parts[0], $timezone, $country_code );
     205        $guess = guess_location_from_geonames( $location_name_parts[0], $timezone, $country_code, $wildcard = false );
    202206    }
    203207
     
    205209        // Catch input like "Sao Paulo Brazil"
    206210        $city_name = sprintf( '%s %s', $location_name_parts[0], $location_name_parts[1] );
    207         $guess     = guess_location_from_geonames( $city_name, $timezone, $country_code );
    208     }
    209 
    210     // Normalize all errors to boolean false for consistency
    211     if ( empty ( $guess ) ) {
    212         $guess = false;
     211        $guess     = guess_location_from_geonames( $city_name, $timezone, $country_code, $wildcard = false );
    213212    }
    214213
     
    225224 * @return stdClass|null
    226225 */
    227 function guess_location_from_geonames( $location_name, $timezone, $country ) {
     226function guess_location_from_geonames( $location_name, $timezone, $country, $wildcard = true ) {
    228227    global $wpdb;
    229228    // Look for a location that matches the name.
     
    231230    // And we sort by population desc, assuming that the biggest matching location is the most likely one.
    232231
    233     // Strip all quotes from the search query, and then enclose it in double quotes, to force an exact literal search
    234     $quoted_location_name = sprintf(
    235         '"%s"',
    236         strtr( $location_name, [ '"' => '', "'" => '' ] )
    237     );
    238 
     232    // Exact match
    239233    $row = $wpdb->get_row( $wpdb->prepare( "
    240234        SELECT name, latitude, longitude, country
    241         FROM geoname
    242         WHERE
    243             MATCH( name, asciiname, alternatenames )
    244             AGAINST( %s IN BOOLEAN MODE )
     235        FROM geoname_summary
     236        WHERE name = %s
    245237        ORDER BY
    246238            FIELD( %s, country  ) DESC,
     
    248240            population DESC
    249241        LIMIT 1",
    250         $quoted_location_name,
     242        $location_name,
    251243        $country,
    252244        $timezone
    253245    ) );
    254246
    255     if ( ! is_a( $row, 'stdClass' ) && 'ASCII' !== mb_detect_encoding( $location_name ) ) {
    256         $row = guess_location_from_geonames_fallback( $location_name, $country, $timezone, 'exact', 'ideographic' );
    257     }
     247    // Wildcard match
     248    if ( ! $row && $wildcard && 'ASCII' !== mb_detect_encoding( $location_name ) ) {
     249        $row = $wpdb->get_row( $wpdb->prepare( "
     250            SELECT name, latitude, longitude, country
     251            FROM geoname_summary
     252            WHERE name LIKE %s
     253            ORDER BY
     254                FIELD( %s, country  ) DESC,
     255                FIELD( %s, timezone ) DESC,
     256                population DESC
     257            LIMIT 1",
     258            $location_name . '%',
     259            $country,
     260            $timezone
     261        ) );
     262    }
     263
     264    // Suffix the "State", good in some countries (western countries) horrible in others
     265    // (where geonames data is not as complete, or region names are similar (but not quite the same) to city names)
     266    // LEFT JOIN admin1codes ac ON gs.statecode = ac.code
     267    // if ( $row->state && $row->state != $row->name && $row->name NOT CONTAINED WITHIN $row->state? ) {
     268    //   $row->name .= ', ' . $row->state;
     269    // }
    258270
    259271    return $row;
    260272}
    261273
    262 /**
    263  * Look for the given location in the Geonames database using a LIKE query
    264  *
    265  * This is a fallback for situations where the full-text search in `guess_location_from_geonames()` resulted
    266  * in a false-negative.
    267  *
    268  * One situation where this happens is with queries in ideographic languages, because MySQL < 5.7.6 doesn't
    269  * support full-text searches for them, because it can't determine where the word boundaries are.
    270  * See https://dev.mysql.com/doc/refman/5.7/en/fulltext-restrictions.html
    271  *
    272  * There are also edge cases where the exact query doesn't exist in the database, but a loose LIKE query will find
    273  * a similar alternate, like `Osakashi`.
    274  *
    275  * @param string $location_name
    276  * @param string $country
    277  * @param string $timezone
    278  * @param string $mode          'exact' to only return exact matches from the database;
    279  *                              'loose' to return any match. This has a high chance of false positives.
    280  * @param string $restrict_counties 'ideographic' to only search in countries where ideographic languages are common;
    281  *                                  'none' to search all countries
    282  *
    283  * @return stdClass|null
    284  */
    285 function guess_location_from_geonames_fallback( $location_name, $country, $timezone, $mode = 'exact', $restrict_counties = 'ideographic' ) {
    286     global $wpdb;
    287 
    288     $where = $ideographic_countries = $ideographic_country_placeholders = '';
    289 
    290     /*
    291      * The name is wrapped in commas in order to ensure that we're only matching the exact location, which is
    292      * delimited by commas. Otherwise, there would be false positives in situations where `$location_name`
    293      * appears in other rows, which happens sometimes.
    294      *
    295      * Because this will only match entries that are prefixed _and_ postfixed with a comma, it will never match the
    296      * first and last entries in the column. That's ok, though, because the first entry is often an airport code
    297      * in English, which is shorter than `ft_min_word_len` anyway. The last entry is often ideographic, so it'd be nice
    298      * to match it, but this is good enough for now.
    299      */
    300     $escaped_location_name = sprintf(
    301         'loose' === $mode ? '%%%s%%' : '%%,%s,%%',
    302         $wpdb->esc_like( $location_name )
    303     );
    304 
    305     $prepare_args = array( $escaped_location_name, $country, $timezone );
    306 
    307     if ( 'ideographic' == $restrict_counties ) {
    308         $ideographic_countries            = get_ideographic_counties();
    309         $ideographic_country_placeholders = get_prepare_placeholders( count( $ideographic_countries ), '%s' );
    310 
    311         $where .= "country IN ( $ideographic_country_placeholders ) AND";
    312 
    313         $prepare_args = array_merge( $ideographic_countries, $prepare_args );
    314     }
    315 
    316     /*
    317      * REPLACE() is used because sometimes the `alternatenames` column contains entries where the `asciiname` is
    318      * prefixed to an ideographic name; for example: `,Karachi - كراچى,`
    319      *
    320      * If that prefix is not removed, then the LIKE query will fail in those cases, because
    321      * `$escaped_location_name` is wrapped in commas.
    322      *
    323      * The query is restricted to countries where ideographic languages are common, in order to avoid a full-table
    324      * scan.
    325      */
    326     $query = "
    327         SELECT name, latitude, longitude, country
    328         FROM `geoname`
    329         WHERE
    330             $where
    331             REPLACE( alternatenames, CONCAT( asciiname, ' - ' ), '' ) LIKE %s
    332         ORDER BY
    333             FIELD( %s, country  ) DESC,
    334             FIELD( %s, timezone ) DESC,
    335             population DESC
    336         LIMIT 1";
    337 
    338     $prepared_query = $wpdb->prepare( $query, $prepare_args );
    339 
    340     return $wpdb->get_row( $prepared_query );
    341 }
    342 
    343 /**
    344  * Get an array of countries where ideographic languages are common
    345  *
    346  * Derived from https://en.wikipedia.org/wiki/List_of_writing_systems#List_of_writing_scripts_by_adoption
    347  *
    348  * @todo Some of these individual countries may be able to be removed, to further narrow the rows that need to be
    349  *       scanned by `guess_ideographic_location_from_geonames()`. Some of the entire categories could possibly be
    350  *       removed too, but let's err on the side of caution for now.
    351  */
    352 function get_ideographic_counties() {
    353     $middle_east  = array( 'AE', 'BH', 'CY', 'EG', 'IL', 'IR', 'IQ', 'JO', 'KW', 'LB', 'OM', 'PS', 'QA', 'SA', 'SY', 'TR', 'YE' );
    354     $north_africa = array( 'DZ', 'EH', 'EG', 'LY', 'MA', 'SD', 'SS', 'TN' );
    355 
    356     $abjad_countries       = array_merge( $middle_east, $north_africa, array( 'CN', 'IL', 'IN', 'MY', 'PK' ) );
    357     $abugida_countries     = array( 'BD', 'BT', 'ER', 'ET', 'ID', 'IN', 'KH', 'LA', 'LK', 'MV', 'MY', 'MU', 'MM', 'NP', 'PK', 'SG', 'TH' );
    358     $logographic_countries = array( 'CN', 'JP', 'KR', 'MY', 'SG');
    359 
    360     $all_ideographic_countries = array_merge( $abjad_countries, $abugida_countries, $logographic_countries );
    361 
    362     return array_unique( $all_ideographic_countries );
    363 }
    364 
    365 /**
    366  * Build a string of placeholders to pass to `WPDB::prepare()`
    367  *
    368  * Sometimes it's convenient to be able to generate placeholders for `prepare()` dynamically. For example, when
    369  * looping through a multi-dimensional array where the sub-arrays have distinct counts; or when the total
    370  * number of items is too large to conveniently count by hand.
    371  *
    372  * See https://iandunn.name/2016/03/31/generating-dynamic-placeholders-for-wpdb-prepare/
    373  *
    374  * @param int    $number The number of placeholders needed
    375  * @param string $format An sprintf()-like format accepted by WPDB::prepare()
    376  *
    377  * @return string
    378  */
    379 function get_prepare_placeholders( $number, $format ) {
    380     return implode( ', ', array_fill( 0, $number, $format ) );
    381 }
    382274
    383275/**
     
    580472                );
    581473            }
    582         }
    583     }
    584 
    585     /*
    586      * If all else fails, cast a wide net and try to find something before giving up, even
    587      * if the chance of success if lower than normal. Returning false is guaranteed failure, so this improves things
    588      * even if it only works 10% of the time.
    589      *
    590      * This must be done as the very last thing before giving up, because the likelihood of false positives is high.
    591      */
    592     if ( ! $location && isset( $args['location_name'] ) ) {
    593         if ( 'ASCII' === mb_detect_encoding( $args['location_name'] ) ) {
    594             $guess = guess_location_from_geonames_fallback( $args['location_name'], $country_code, $args['timezone'] ?? '', 'loose', 'none' );
    595         } else {
    596             $guess = guess_location_from_geonames_fallback( $args['location_name'], $country_code, $args['timezone'] ?? '', 'loose', 'ideographic' );
    597         }
    598 
    599         if ( $guess ) {
    600             $location = array(
    601                 'description' => $guess->name,
    602                 'latitude'    => $guess->latitude,
    603                 'longitude'   => $guess->longitude,
    604                 'country'     => $guess->country,
    605             );
    606474        }
    607475    }
  • sites/trunk/api.wordpress.org/public_html/events/1.0/tests/test-index.php

    r5501 r5543  
    154154        ),
    155155
    156         /*
    157          * This is matching a city inside the country before it the country searches run, but that's ok since it's
    158          * good enough for our use cases
    159          */
    160156        'country-exonym-2-words' => array(
    161157            'input' => array(
     
    165161            ),
    166162            'expected' => array(
    167                 'description' => 'pale',
    168                 'latitude'    => '43.817',
    169                 'longitude'   => '18.569',
    170                 'country'     => 'BA'
     163                'country'     => 'BA',
     164                'description' => 'bosnia and herzegovina',
    171165            ),
    172166        ),
     
    288282            ),
    289283            'expected' => array(
    290                 'description' => 'são paulo',
     284                'description' => 'sao paulo',
    291285                'latitude'    => '-23.548',
    292286                'longitude'   => '-46.636',
     
    303297            ),
    304298            'expected' => array(
    305                 'description' => 'osaka',
     299                'description' => 'osakashi',
    306300                'latitude'    => '34.694',
    307301                'longitude'   => '135.502',
     
    319313            ),
    320314            'expected' => array(
    321                 'description' => 'osaka',
     315                'description' => 'osakashi',
    322316                'latitude'    => '34.694',
    323317                'longitude'   => '135.502',
     
    366360            ),
    367361            'expected' => array(
    368                 'description' => "doña ana",
     362                'description' => "dona ana",
    369363                'latitude'    => '32.390',
    370364                'longitude'   => '-106.814',
     
    394388            ),
    395389            'expected' => array(
    396                 'description' => "st. louis",
     390                'description' => "st louis",
    397391                'latitude'    => '38.627',
    398392                'longitude'   => '-90.198',
     
    429423            ),
    430424            'expected' => array(
    431                 'description' => 'addis ababa',
     425                'description' => 'አዲስ አበባ',
    432426                'latitude'    => '9.025',
    433427                'longitude'   => '38.747',
     
    443437            ),
    444438            'expected' => array(
    445                 'description' => 'shirahamachō-usazakiminami',
     439                'description' => '白浜町宇佐崎南',
    446440                'latitude'    => '34.783',
    447441                'longitude'   => '134.717',
     
    457451            ),
    458452            'expected' => array(
    459                 'description' => 'tehran',
     453                'description' => 'تهران',
    460454                'latitude'    => '35.694',
    461455                'longitude'   => '51.422',
     
    471465            ),
    472466            'expected' => array(
    473                 'description' => 'karachi',
     467                'description' => 'كراچى',
    474468                'latitude'    => '24.906',
    475469                'longitude'   => '67.082',
     
    485479            ),
    486480            'expected' => array(
    487                 'description' => 'kyoto',
     481                'description' => '京都',
    488482                'latitude'    => '35.021',
    489483                'longitude'   => '135.754',
     
    499493            ),
    500494            'expected' => array(
    501                 'description' => 'tokyo',
     495                'description' => '東京',
    502496                'latitude'    => '35.690',
    503497                'longitude'   => '139.692',
     
    514508            ),
    515509            'expected' => array(
    516                 'description' => 'osaka',
     510                'description' => '大阪市',
    517511                'latitude'    => '34.694',
    518512                'longitude'   => '135.502',
     
    528522            ),
    529523            'expected' => array(
    530                 'description' => 'vienna',
     524                'description' => 'wien',
    531525                'latitude'    => '48.208',
    532526                'longitude'   => '16.372',
     
    542536            ),
    543537            'expected' => array(
    544                 'description' => 'moscow',
     538                'description' => 'Москва',
    545539                'latitude'    => '55.752',
    546540                'longitude'   => '37.616',
     
    556550            ),
    557551            'expected' => array(
    558                 'description' => 'mexico city',
     552                'description' => 'ciudad de méxico',
    559553                'latitude'    => '19.428',
    560554                'longitude'   => '-99.128',
Note: See TracChangeset for help on using the changeset viewer.