Making WordPress.org

Changeset 5277


Ignore:
Timestamp:
04/07/2017 05:20:54 AM (8 years ago)
Author:
iandunn
Message:

Events: Fallback to a LIKE query for ideographic languages

MySQL < 5.7.6 doesn't support full-text queries for ideographic languages, like Japanese.

Location:
sites/trunk/api.wordpress.org/public_html/events/1.0
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • sites/trunk/api.wordpress.org/public_html/events/1.0/index.php

    r5275 r5277  
    150150    /*
    151151     * Multi-word queries may contain cities, regions, and countries, so try to extract just the city
     152     *
     153     * This won't work for most ideographic languages, because they don't use the space character as a word
     154     * delimiter. That's ok, though, because `guess_ideographic_location_from_geonames()` should cover those
     155     * cases.
    152156     */
    153157    if ( ! $guess && $location_word_count >= 2 ) {
     
    207211    ) );
    208212
     213    if ( ! is_a( $row, 'stdClass' ) && 'ASCII' !== mb_detect_encoding( $location_name ) ) {
     214        $row = guess_ideographic_location_from_geonames( $location_name, $country, $timezone );
     215    }
     216
    209217    return $row;
     218}
     219
     220/**
     221 * Look for the given ideographic location in the Geonames database
     222 *
     223 * This is a fallback for situations where the full-text search in `guess_location_from_geonames()` resulted
     224 * in a false-negative. MySQL < 5.7.6 doesn't support full-text searches on ideographic languages, because
     225 * it cannot determine where the word boundaries are.
     226 *
     227 * See https://dev.mysql.com/doc/refman/5.7/en/fulltext-restrictions.html
     228 *
     229 * @param string $location_name
     230 * @param string $country
     231 * @param string $timezone
     232 *
     233 * @return stdClass|null
     234 */
     235function guess_ideographic_location_from_geonames( $location_name, $country, $timezone ) {
     236    global $wpdb;
     237
     238    $ideographic_countries            = get_ideographic_counties();
     239    $ideographic_country_placeholders = get_prepare_placeholders( count( $ideographic_countries ), '%s' );
     240
     241    /*
     242     * The name is wrapped in commas in order to ensure that we're only matching the exact location, which is
     243     * delimited by commas. Otherwise, there would be false positives in situations where `$location_name`
     244     * appears in other rows, which happens sometimes.
     245     *
     246     * Because this will only match entries that are prefixed _and_ postfixed with a comma, it will never match the
     247     * first and last entries in the column. That's ok, though, because the first entry is always an airport code
     248     * in English, which will be matched by other functions. The last entry is often ideographic, so it'd be nice
     249     * to match it, but this is good enough for now.
     250     */
     251    $escaped_location_name = sprintf( '%%,%s,%%', $wpdb->esc_like( $location_name ) );
     252
     253    /*
     254     * REPLACE() is used because sometimes the `alternatenames` column contains entries where the `asciiname` is
     255     * prefixed to an ideographic name; for example: `,Karachi - كراچى,`
     256     *
     257     * If that prefix is not removed, then the LIKE query will fail in those cases, because
     258     * `$escaped_location_name` is wrapped in commas.
     259     *
     260     * The query is restricted to countries where ideographic languages are common, in order to avoid a full-table
     261     * scan.
     262     */
     263    $query = "
     264        SELECT name, latitude, longitude, country
     265        FROM `geoname`
     266        WHERE
     267            country IN ( $ideographic_country_placeholders ) AND
     268            REPLACE( alternatenames, CONCAT( asciiname, ' - ' ), '' ) LIKE %s
     269        ORDER BY
     270            FIELD( %s, country  ) DESC,
     271            FIELD( %s, timezone ) DESC,
     272            population DESC
     273        LIMIT 1";
     274
     275    $prepared_query = $wpdb->prepare(
     276        $query,
     277        array_merge( $ideographic_countries, array( $escaped_location_name, $country, $timezone ) )
     278    );
     279
     280    return $wpdb->get_row( $prepared_query );
     281}
     282
     283/**
     284 * Get an array of countries where ideographic languages are common
     285 *
     286 * Derived from https://en.wikipedia.org/wiki/List_of_writing_systems#List_of_writing_scripts_by_adoption
     287 *
     288 * @todo Some of these individual countries may be able to be removed, to further narrow the rows that need to be
     289 *       scanned by `guess_ideographic_location_from_geonames()`. Some of the entire categories could possibly be
     290 *       removed too, but let's err on the side of caution for now.
     291 */
     292function get_ideographic_counties() {
     293    $middle_east  = array( 'AE', 'BH', 'CY', 'EG', 'IL', 'IR', 'IQ', 'JO', 'KW', 'LB', 'OM', 'PS', 'QA', 'SA', 'SY', 'TR', 'YE' );
     294    $north_africa = array( 'DZ', 'EH', 'EG', 'LY', 'MA', 'SD', 'SS', 'TN' );
     295
     296    $abjad_countries       = array_merge( $middle_east, $north_africa, array( 'CN', 'IL', 'IN', 'MY', 'PK' ) );
     297    $abugida_countries     = array( 'BD', 'BT', 'ER', 'ET', 'ID', 'IN', 'KH', 'LA', 'LK', 'MV', 'MY', 'MU', 'MM', 'NP', 'PK', 'SG', 'TH' );
     298    $logographic_countries = array( 'CN', 'JP', 'KR', 'MY', 'SG');
     299
     300    $all_ideographic_countries = array_merge( $abjad_countries, $abugida_countries, $logographic_countries );
     301
     302    return array_unique( $all_ideographic_countries );
     303}
     304
     305/**
     306 * Build a string of placeholders to pass to `WPDB::prepare()`
     307 *
     308 * Sometimes it's convenient to be able to generate placeholders for `prepare()` dynamically. For example, when
     309 * looping through a multi-dimensional array where the sub-arrays have distinct counts; or when the total
     310 * number of items is too large to conveniently count by hand.
     311 *
     312 * See https://iandunn.name/2016/03/31/generating-dynamic-placeholders-for-wpdb-prepare/
     313 *
     314 * @param int    $number The number of placeholders needed
     315 * @param string $format An sprintf()-like format accepted by WPDB::prepare()
     316 *
     317 * @return string
     318 */
     319function get_prepare_placeholders( $number, $format ) {
     320    return implode( ', ', array_fill( 0, $number, $format ) );
    210321}
    211322
  • sites/trunk/api.wordpress.org/public_html/events/1.0/tests/test-index.php

    r5276 r5277  
    305305        ),
    306306
    307         'city-endonym-non-latin-length-greater-than-4-asia' => array(
     307        'city-endonym-ideographic-asia1' => array(
    308308            'input' => array(
    309309                'location_name' => '白浜町宇佐崎南',
     
    319319        ),
    320320
    321         /*
    322          * @todo
    323          *
    324          * This is currently failing, but should pass. It looks like the value is in row 112931.
    325          */
    326         'city-endonym-non-latin-length-greater-than-4-asia2' => array(
     321        'city-endonym-ideographic-asia2' => array(
    327322            'input' => array(
    328323                'location_name' => 'تهران',
     
    338333        ),
    339334
    340         /*
    341          * @todo
    342          *
    343          * This is currently failing, but should pass. It looks like the value is in row 1174872
    344          */
    345         'city-endonym-non-latin-length-greater-than-4-asia3' => array(
     335        'city-endonym-ideographic-asia3' => array(
    346336            'input' => array(
    347337                'location_name' => 'كراچى',
     
    357347        ),
    358348
    359         /*
    360          * @todo
    361          *
    362          * This is currently failing, but it should pass. One reason it may be failing is that `ft_min_word_len`
    363          * is set to `4` and `東京` is only `2`.
    364          *
    365          * But, there are others that are failing that are >= 4, though, like `シラオカ`, `しらおか`, `Ширахама`, and
    366          *`すぎと,スギト`. So, there may be additional reasons too.
    367          */
    368         'city-endonym-non-latin-length-less-than-4-asia' => array(
     349        'city-endonym-ideographic-asia4' => array(
     350            'input' => array(
     351                'location_name' => '京都',
     352                'locale'        => 'ja',
     353                'timezone'      => 'Asia/Tokyo',
     354            ),
     355            'expected' => array(
     356                'description' => 'kyoto',
     357                'latitude'    => '35.021',
     358                'longitude'   => '135.754',
     359                'country'     => 'JP',
     360            ),
     361        ),
     362
     363        'city-endonym-ideographic-asia5' => array(
    369364            'input' => array(
    370365                'location_name' => '東京',
Note: See TracChangeset for help on using the changeset viewer.