Changeset 5543
- Timestamp: 06/06/2017 05:39:24 PM (8 years ago)
- Location: sites/trunk/api.wordpress.org/public_html/events/1.0
- Files: 2 edited
Legend:
- Unmodified
- Added
- Removed
-
sites/trunk/api.wordpress.org/public_html/events/1.0/index.php
r5541 r5543 187 187 function guess_location_from_city( $location_name, $timezone, $country_code ) { 188 188 $guess = guess_location_from_geonames( $location_name, $timezone, $country_code ); 189 $location_word_count = str_word_count( $location_name ); 190 $location_name_parts = explode( ' ', $location_name ); 189 190 if ( $guess ) { 191 return $guess; 192 } 191 193 192 194 /* … … 194 196 * 195 197 * This won't work for most ideographic languages, because they don't use the space character as a word 196 * delimiter. That's ok, though, because `guess_ideographic_location_from_geonames()` should cover those 197 * cases. 198 * delimiter. 198 199 */ 200 $location_name_parts = preg_split( '/\s+/u', $location_name ); 201 $location_word_count = count( $location_name_parts ); 202 199 203 if ( ! $guess && $location_word_count >= 2 ) { 200 204 // Catch input like "Portland Maine" 201 $guess = guess_location_from_geonames( $location_name_parts[0], $timezone, $country_code );205 $guess = guess_location_from_geonames( $location_name_parts[0], $timezone, $country_code, $wildcard = false ); 202 206 } 203 207 … … 205 209 // Catch input like "Sao Paulo Brazil" 206 210 $city_name = sprintf( '%s %s', $location_name_parts[0], $location_name_parts[1] ); 207 $guess = guess_location_from_geonames( $city_name, $timezone, $country_code ); 208 } 209 210 // Normalize all errors to boolean false for consistency 211 if ( empty ( $guess ) ) { 212 $guess = false; 211 $guess = guess_location_from_geonames( $city_name, $timezone, $country_code, $wildcard = false ); 213 212 } 214 213 … … 225 224 * @return stdClass|null 226 225 */ 227 function guess_location_from_geonames( $location_name, $timezone, $country ) {226 function guess_location_from_geonames( $location_name, $timezone, $country, $wildcard = true ) { 228 227 global $wpdb; 229 228 // Look for a location that matches the name. … … 231 230 // And we sort by population desc, assuming that the biggest matching location is the most likely one. 
232 231 233 // Strip all quotes from the search query, and then enclose it in double quotes, to force an exact literal search 234 $quoted_location_name = sprintf( 235 '"%s"', 236 strtr( $location_name, [ '"' => '', "'" => '' ] ) 237 ); 238 232 // Exact match 239 233 $row = $wpdb->get_row( $wpdb->prepare( " 240 234 SELECT name, latitude, longitude, country 241 FROM geoname 242 WHERE 243 MATCH( name, asciiname, alternatenames ) 244 AGAINST( %s IN BOOLEAN MODE ) 235 FROM geoname_summary 236 WHERE name = %s 245 237 ORDER BY 246 238 FIELD( %s, country ) DESC, … … 248 240 population DESC 249 241 LIMIT 1", 250 $ quoted_location_name,242 $location_name, 251 243 $country, 252 244 $timezone 253 245 ) ); 254 246 255 if ( ! is_a( $row, 'stdClass' ) && 'ASCII' !== mb_detect_encoding( $location_name ) ) { 256 $row = guess_location_from_geonames_fallback( $location_name, $country, $timezone, 'exact', 'ideographic' ); 257 } 247 // Wildcard match 248 if ( ! $row && $wildcard && 'ASCII' !== mb_detect_encoding( $location_name ) ) { 249 $row = $wpdb->get_row( $wpdb->prepare( " 250 SELECT name, latitude, longitude, country 251 FROM geoname_summary 252 WHERE name LIKE %s 253 ORDER BY 254 FIELD( %s, country ) DESC, 255 FIELD( %s, timezone ) DESC, 256 population DESC 257 LIMIT 1", 258 $location_name . '%', 259 $country, 260 $timezone 261 ) ); 262 } 263 264 // Suffix the "State", good in some countries (western countries) horrible in others 265 // (where geonames data is not as complete, or region names are similar (but not quite the same) to city names) 266 // LEFT JOIN admin1codes ac ON gs.statecode = ac.code 267 // if ( $row->state && $row->state != $row->name && $row->name NOT CONTAINED WITHIN $row->state? ) { 268 // $row->name .= ', ' . 
$row->state; 269 // } 258 270 259 271 return $row; 260 272 } 261 273 262 /**263 * Look for the given location in the Geonames database using a LIKE query264 *265 * This is a fallback for situations where the full-text search in `guess_location_from_geonames()` resulted266 * in a false-negative.267 *268 * One situation where this happens is with queries in ideographic languages, because MySQL < 5.7.6 doesn't269 * support full-text searches for them, because it can't determine where the word boundaries are.270 * See https://dev.mysql.com/doc/refman/5.7/en/fulltext-restrictions.html271 *272 * There are also edge cases where the exact query doesn't exist in the database, but a loose LIKE query will find273 * a similar alternate, like `Osakashi`.274 *275 * @param string $location_name276 * @param string $country277 * @param string $timezone278 * @param string $mode 'exact' to only return exact matches from the database;279 * 'loose' to return any match. This has a high chance of false positives.280 * @param string $restrict_counties 'ideographic' to only search in countries where ideographic languages are common;281 * 'none' to search all countries282 *283 * @return stdClass|null284 */285 function guess_location_from_geonames_fallback( $location_name, $country, $timezone, $mode = 'exact', $restrict_counties = 'ideographic' ) {286 global $wpdb;287 288 $where = $ideographic_countries = $ideographic_country_placeholders = '';289 290 /*291 * The name is wrapped in commas in order to ensure that we're only matching the exact location, which is292 * delimited by commas. Otherwise, there would be false positives in situations where `$location_name`293 * appears in other rows, which happens sometimes.294 *295 * Because this will only match entries that are prefixed _and_ postfixed with a comma, it will never match the296 * first and last entries in the column. 
That's ok, though, because the first entry is often an airport code297 * in English, which is shorter than `ft_min_word_len` anyway. The last entry is often ideographic, so it'd be nice298 * to match it, but this is good enough for now.299 */300 $escaped_location_name = sprintf(301 'loose' === $mode ? '%%%s%%' : '%%,%s,%%',302 $wpdb->esc_like( $location_name )303 );304 305 $prepare_args = array( $escaped_location_name, $country, $timezone );306 307 if ( 'ideographic' == $restrict_counties ) {308 $ideographic_countries = get_ideographic_counties();309 $ideographic_country_placeholders = get_prepare_placeholders( count( $ideographic_countries ), '%s' );310 311 $where .= "country IN ( $ideographic_country_placeholders ) AND";312 313 $prepare_args = array_merge( $ideographic_countries, $prepare_args );314 }315 316 /*317 * REPLACE() is used because sometimes the `alternatenames` column contains entries where the `asciiname` is318 * prefixed to an ideographic name; for example: `,Karachi - كراچى,`319 *320 * If that prefix is not removed, then the LIKE query will fail in those cases, because321 * `$escaped_location_name` is wrapped in commas.322 *323 * The query is restricted to countries where ideographic languages are common, in order to avoid a full-table324 * scan.325 */326 $query = "327 SELECT name, latitude, longitude, country328 FROM `geoname`329 WHERE330 $where331 REPLACE( alternatenames, CONCAT( asciiname, ' - ' ), '' ) LIKE %s332 ORDER BY333 FIELD( %s, country ) DESC,334 FIELD( %s, timezone ) DESC,335 population DESC336 LIMIT 1";337 338 $prepared_query = $wpdb->prepare( $query, $prepare_args );339 340 return $wpdb->get_row( $prepared_query );341 }342 343 /**344 * Get an array of countries where ideographic languages are common345 *346 * Derived from https://en.wikipedia.org/wiki/List_of_writing_systems#List_of_writing_scripts_by_adoption347 *348 * @todo Some of these individual countries may be able to be removed, to further narrow the rows that need to be349 * 
scanned by `guess_ideographic_location_from_geonames()`. Some of the entire categories could possibly be350 * removed too, but let's err on the side of caution for now.351 */352 function get_ideographic_counties() {353 $middle_east = array( 'AE', 'BH', 'CY', 'EG', 'IL', 'IR', 'IQ', 'JO', 'KW', 'LB', 'OM', 'PS', 'QA', 'SA', 'SY', 'TR', 'YE' );354 $north_africa = array( 'DZ', 'EH', 'EG', 'LY', 'MA', 'SD', 'SS', 'TN' );355 356 $abjad_countries = array_merge( $middle_east, $north_africa, array( 'CN', 'IL', 'IN', 'MY', 'PK' ) );357 $abugida_countries = array( 'BD', 'BT', 'ER', 'ET', 'ID', 'IN', 'KH', 'LA', 'LK', 'MV', 'MY', 'MU', 'MM', 'NP', 'PK', 'SG', 'TH' );358 $logographic_countries = array( 'CN', 'JP', 'KR', 'MY', 'SG');359 360 $all_ideographic_countries = array_merge( $abjad_countries, $abugida_countries, $logographic_countries );361 362 return array_unique( $all_ideographic_countries );363 }364 365 /**366 * Build a string of placeholders to pass to `WPDB::prepare()`367 *368 * Sometimes it's convenient to be able to generate placeholders for `prepare()` dynamically. For example, when369 * looping through a multi-dimensional array where the sub-arrays have distinct counts; or when the total370 * number of items is too large to conveniently count by hand.371 *372 * See https://iandunn.name/2016/03/31/generating-dynamic-placeholders-for-wpdb-prepare/373 *374 * @param int $number The number of placeholders needed375 * @param string $format An sprintf()-like format accepted by WPDB::prepare()376 *377 * @return string378 */379 function get_prepare_placeholders( $number, $format ) {380 return implode( ', ', array_fill( 0, $number, $format ) );381 }382 274 383 275 /** … … 580 472 ); 581 473 } 582 }583 }584 585 /*586 * If all else fails, cast a wide net and try to find something before giving up, even587 * if the chance of success if lower than normal. 
Returning false is guaranteed failure, so this improves things588 * even if it only works 10% of the time.589 *590 * This must be done as the very last thing before giving up, because the likelihood of false positives is high.591 */592 if ( ! $location && isset( $args['location_name'] ) ) {593 if ( 'ASCII' === mb_detect_encoding( $args['location_name'] ) ) {594 $guess = guess_location_from_geonames_fallback( $args['location_name'], $country_code, $args['timezone'] ?? '', 'loose', 'none' );595 } else {596 $guess = guess_location_from_geonames_fallback( $args['location_name'], $country_code, $args['timezone'] ?? '', 'loose', 'ideographic' );597 }598 599 if ( $guess ) {600 $location = array(601 'description' => $guess->name,602 'latitude' => $guess->latitude,603 'longitude' => $guess->longitude,604 'country' => $guess->country,605 );606 474 } 607 475 } -
sites/trunk/api.wordpress.org/public_html/events/1.0/tests/test-index.php
r5501 r5543 154 154 ), 155 155 156 /*157 * This is matching a city inside the country before it the country searches run, but that's ok since it's158 * good enough for our use cases159 */160 156 'country-exonym-2-words' => array( 161 157 'input' => array( … … 165 161 ), 166 162 'expected' => array( 167 'description' => 'pale', 168 'latitude' => '43.817', 169 'longitude' => '18.569', 170 'country' => 'BA' 163 'country' => 'BA', 164 'description' => 'bosnia and herzegovina', 171 165 ), 172 166 ), … … 288 282 ), 289 283 'expected' => array( 290 'description' => 's ão paulo',284 'description' => 'sao paulo', 291 285 'latitude' => '-23.548', 292 286 'longitude' => '-46.636', … … 303 297 ), 304 298 'expected' => array( 305 'description' => 'osaka ',299 'description' => 'osakashi', 306 300 'latitude' => '34.694', 307 301 'longitude' => '135.502', … … 319 313 ), 320 314 'expected' => array( 321 'description' => 'osaka ',315 'description' => 'osakashi', 322 316 'latitude' => '34.694', 323 317 'longitude' => '135.502', … … 366 360 ), 367 361 'expected' => array( 368 'description' => "do ña ana",362 'description' => "dona ana", 369 363 'latitude' => '32.390', 370 364 'longitude' => '-106.814', … … 394 388 ), 395 389 'expected' => array( 396 'description' => "st .louis",390 'description' => "st louis", 397 391 'latitude' => '38.627', 398 392 'longitude' => '-90.198', … … 429 423 ), 430 424 'expected' => array( 431 'description' => ' addis ababa',425 'description' => 'አዲስ አበባ', 432 426 'latitude' => '9.025', 433 427 'longitude' => '38.747', … … 443 437 ), 444 438 'expected' => array( 445 'description' => ' shirahamachō-usazakiminami',439 'description' => '白浜町宇佐崎南', 446 440 'latitude' => '34.783', 447 441 'longitude' => '134.717', … … 457 451 ), 458 452 'expected' => array( 459 'description' => ' tehran',453 'description' => 'تهران', 460 454 'latitude' => '35.694', 461 455 'longitude' => '51.422', … … 471 465 ), 472 466 'expected' => array( 473 'description' => ' karachi',467 
'description' => 'كراچى', 474 468 'latitude' => '24.906', 475 469 'longitude' => '67.082', … … 485 479 ), 486 480 'expected' => array( 487 'description' => ' kyoto',481 'description' => '京都', 488 482 'latitude' => '35.021', 489 483 'longitude' => '135.754', … … 499 493 ), 500 494 'expected' => array( 501 'description' => ' tokyo',495 'description' => '東京', 502 496 'latitude' => '35.690', 503 497 'longitude' => '139.692', … … 514 508 ), 515 509 'expected' => array( 516 'description' => ' osaka',510 'description' => '大阪市', 517 511 'latitude' => '34.694', 518 512 'longitude' => '135.502', … … 528 522 ), 529 523 'expected' => array( 530 'description' => ' vienna',524 'description' => 'wien', 531 525 'latitude' => '48.208', 532 526 'longitude' => '16.372', … … 542 536 ), 543 537 'expected' => array( 544 'description' => ' moscow',538 'description' => 'Москва', 545 539 'latitude' => '55.752', 546 540 'longitude' => '37.616', … … 556 550 ), 557 551 'expected' => array( 558 'description' => ' mexico city',552 'description' => 'ciudad de méxico', 559 553 'latitude' => '19.428', 560 554 'longitude' => '-99.128',
Note: See TracChangeset for help on using the changeset viewer.