Making WordPress.org


Ignore:
Timestamp:
04/07/2017 05:20:54 AM (8 years ago)
Author:
iandunn
Message:

Events: Fallback to a LIKE query for ideographic languages

MySQL < 5.7.6 doesn't support full-text queries for ideographic languages, like Japanese.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • sites/trunk/api.wordpress.org/public_html/events/1.0/index.php

    r5275 r5277  
    150150    /*
    151151     * Multi-word queries may contain cities, regions, and countries, so try to extract just the city
     152     *
     153     * This won't work for most ideographic languages, because they don't use the space character as a word
     154     * delimiter. That's ok, though, because `guess_ideographic_location_from_geonames()` should cover those
     155     * cases.
    152156     */
    153157    if ( ! $guess && $location_word_count >= 2 ) {
     
    207211    ) );
    208212
     213    if ( ! is_a( $row, 'stdClass' ) && 'ASCII' !== mb_detect_encoding( $location_name ) ) {
     214        $row = guess_ideographic_location_from_geonames( $location_name, $country, $timezone );
     215    }
     216
    209217    return $row;
     218}
     219
     220/**
     221 * Look for the given ideographic location in the Geonames database
     222 *
     223 * This is a fallback for situations where the full-text search in `guess_location_from_geonames()` resulted
     224 * in a false-negative. MySQL < 5.7.6 doesn't support full-text searches on ideographic languages, because
     225 * it cannot determine where the word boundaries are.
     226 *
     227 * See https://dev.mysql.com/doc/refman/5.7/en/fulltext-restrictions.html
     228 *
     229 * @param string $location_name
     230 * @param string $country
     231 * @param string $timezone
     232 *
     233 * @return stdClass|null
     234 */
     235function guess_ideographic_location_from_geonames( $location_name, $country, $timezone ) {
     236    global $wpdb;
     237
     238    $ideographic_countries            = get_ideographic_counties();
     239    $ideographic_country_placeholders = get_prepare_placeholders( count( $ideographic_countries ), '%s' );
     240
     241    /*
     242     * The name is wrapped in commas in order to ensure that we're only matching the exact location, which is
     243     * delimited by commas. Otherwise, there would be false positives in situations where `$location_name`
     244     * appears in other rows, which happens sometimes.
     245     *
     246     * Because this will only match entries that are prefixed _and_ postfixed with a comma, it will never match the
     247     * first and last entries in the column. That's ok, though, because the first entry is always an airport code
     248     * in English, which will be matched by other functions. The last entry is often ideographic, so it'd be nice
     249     * to match it, but this is good enough for now.
     250     */
     251    $escaped_location_name = sprintf( '%%,%s,%%', $wpdb->esc_like( $location_name ) );
     252
     253    /*
     254     * REPLACE() is used because sometimes the `alternatenames` column contains entries where the `asciiname` is
     255     * prefixed to an ideographic name; for example: `,Karachi - كراچى,`
     256     *
     257     * If that prefix is not removed, then the LIKE query will fail in those cases, because
     258     * `$escaped_location_name` is wrapped in commas.
     259     *
     260     * The query is restricted to countries where ideographic languages are common, in order to avoid a full-table
     261     * scan.
     262     */
     263    $query = "
     264        SELECT name, latitude, longitude, country
     265        FROM `geoname`
     266        WHERE
     267            country IN ( $ideographic_country_placeholders ) AND
     268            REPLACE( alternatenames, CONCAT( asciiname, ' - ' ), '' ) LIKE %s
     269        ORDER BY
     270            FIELD( %s, country  ) DESC,
     271            FIELD( %s, timezone ) DESC,
     272            population DESC
     273        LIMIT 1";
     274
     275    $prepared_query = $wpdb->prepare(
     276        $query,
     277        array_merge( $ideographic_countries, array( $escaped_location_name, $country, $timezone ) )
     278    );
     279
     280    return $wpdb->get_row( $prepared_query );
     281}
     282
     283/**
     284 * Get an array of countries where ideographic languages are common
     285 *
     286 * Derived from https://en.wikipedia.org/wiki/List_of_writing_systems#List_of_writing_scripts_by_adoption
     287 *
     288 * @todo Some of these individual countries may be able to be removed, to further narrow the rows that need to be
     289 *       scanned by `guess_ideographic_location_from_geonames()`. Some of the entire categories could possibly be
     290 *       removed too, but let's err on the side of caution for now.
     291 */
     292function get_ideographic_counties() {
     293    $middle_east  = array( 'AE', 'BH', 'CY', 'EG', 'IL', 'IR', 'IQ', 'JO', 'KW', 'LB', 'OM', 'PS', 'QA', 'SA', 'SY', 'TR', 'YE' );
     294    $north_africa = array( 'DZ', 'EH', 'EG', 'LY', 'MA', 'SD', 'SS', 'TN' );
     295
     296    $abjad_countries       = array_merge( $middle_east, $north_africa, array( 'CN', 'IL', 'IN', 'MY', 'PK' ) );
     297    $abugida_countries     = array( 'BD', 'BT', 'ER', 'ET', 'ID', 'IN', 'KH', 'LA', 'LK', 'MV', 'MY', 'MU', 'MM', 'NP', 'PK', 'SG', 'TH' );
     298    $logographic_countries = array( 'CN', 'JP', 'KR', 'MY', 'SG');
     299
     300    $all_ideographic_countries = array_merge( $abjad_countries, $abugida_countries, $logographic_countries );
     301
     302    return array_unique( $all_ideographic_countries );
     303}
     304
     305/**
     306 * Build a string of placeholders to pass to `WPDB::prepare()`
     307 *
     308 * Sometimes it's convenient to be able to generate placeholders for `prepare()` dynamically. For example, when
     309 * looping through a multi-dimensional array where the sub-arrays have distinct counts; or when the total
     310 * number of items is too large to conveniently count by hand.
     311 *
     312 * See https://iandunn.name/2016/03/31/generating-dynamic-placeholders-for-wpdb-prepare/
     313 *
     314 * @param int    $number The number of placeholders needed
     315 * @param string $format An sprintf()-like format accepted by WPDB::prepare()
     316 *
     317 * @return string
     318 */
     319function get_prepare_placeholders( $number, $format ) {
     320    return implode( ', ', array_fill( 0, $number, $format ) );
    210321}
    211322
Note: See TracChangeset for help on using the changeset viewer.