Changeset 5277
- Timestamp:
- 04/07/2017 05:20:54 AM (8 years ago)
- Location:
- sites/trunk/api.wordpress.org/public_html/events/1.0
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
sites/trunk/api.wordpress.org/public_html/events/1.0/index.php
r5275 r5277 150 150 /* 151 151 * Multi-word queries may contain cities, regions, and countries, so try to extract just the city 152 * 153 * This won't work for most ideographic languages, because they don't use the space character as a word 154 * delimiter. That's ok, though, because `guess_ideographic_location_from_geonames()` should cover those 155 * cases. 152 156 */ 153 157 if ( ! $guess && $location_word_count >= 2 ) { … … 207 211 ) ); 208 212 213 if ( ! is_a( $row, 'stdClass' ) && 'ASCII' !== mb_detect_encoding( $location_name ) ) { 214 $row = guess_ideographic_location_from_geonames( $location_name, $country, $timezone ); 215 } 216 209 217 return $row; 218 } 219 220 /** 221 * Look for the given ideographic location in the Geonames database 222 * 223 * This is a fallback for situations where the full-text search in `guess_location_from_geonames()` resulted 224 * in a false-negative. MySQL < 5.7.6 doesn't support full-text searches on ideographic languages, because 225 * it cannot determine where the word boundaries are. 226 * 227 * See https://dev.mysql.com/doc/refman/5.7/en/fulltext-restrictions.html 228 * 229 * @param string $location_name 230 * @param string $country 231 * @param string $timezone 232 * 233 * @return stdClass|null 234 */ 235 function guess_ideographic_location_from_geonames( $location_name, $country, $timezone ) { 236 global $wpdb; 237 238 $ideographic_countries = get_ideographic_counties(); 239 $ideographic_country_placeholders = get_prepare_placeholders( count( $ideographic_countries ), '%s' ); 240 241 /* 242 * The name is wrapped in commas in order to ensure that we're only matching the exact location, which is 243 * delimited by commas. Otherwise, there would be false positives in situations where `$location_name` 244 * appears in other rows, which happens sometimes. 245 * 246 * Because this will only match entries that are prefixed _and_ postfixed with a comma, it will never match the 247 * first and last entries in the column. That's ok, though, because the first entry is always an airport code 248 * in English, which will be matched by other functions. The last entry is often ideographic, so it'd be nice 249 * to match it, but this is good enough for now. 250 */ 251 $escaped_location_name = sprintf( '%%,%s,%%', $wpdb->esc_like( $location_name ) ); 252 253 /* 254 * REPLACE() is used because sometimes the `alternatenames` column contains entries where the `asciiname` is 255 * prefixed to an ideographic name; for example: `,Karachi - كراچى,` 256 * 257 * If that prefix is not removed, then the LIKE query will fail in those cases, because 258 * `$escaped_location_name` is wrapped in commas. 259 * 260 * The query is restricted to countries where ideographic languages are common, in order to avoid a full-table 261 * scan. 262 */ 263 $query = " 264 SELECT name, latitude, longitude, country 265 FROM `geoname` 266 WHERE 267 country IN ( $ideographic_country_placeholders ) AND 268 REPLACE( alternatenames, CONCAT( asciiname, ' - ' ), '' ) LIKE %s 269 ORDER BY 270 FIELD( %s, country ) DESC, 271 FIELD( %s, timezone ) DESC, 272 population DESC 273 LIMIT 1"; 274 275 $prepared_query = $wpdb->prepare( 276 $query, 277 array_merge( $ideographic_countries, array( $escaped_location_name, $country, $timezone ) ) 278 ); 279 280 return $wpdb->get_row( $prepared_query ); 281 } 282 283 /** 284 * Get an array of countries where ideographic languages are common 285 * 286 * Derived from https://en.wikipedia.org/wiki/List_of_writing_systems#List_of_writing_scripts_by_adoption 287 * 288 * @todo Some of these individual countries may be able to be removed, to further narrow the rows that need to be 289 * scanned by `guess_ideographic_location_from_geonames()`. Some of the entire categories could possibly be 290 * removed too, but let's err on the side of caution for now. 291 */ 292 function get_ideographic_counties() { 293 $middle_east = array( 'AE', 'BH', 'CY', 'EG', 'IL', 'IR', 'IQ', 'JO', 'KW', 'LB', 'OM', 'PS', 'QA', 'SA', 'SY', 'TR', 'YE' ); 294 $north_africa = array( 'DZ', 'EH', 'EG', 'LY', 'MA', 'SD', 'SS', 'TN' ); 295 296 $abjad_countries = array_merge( $middle_east, $north_africa, array( 'CN', 'IL', 'IN', 'MY', 'PK' ) ); 297 $abugida_countries = array( 'BD', 'BT', 'ER', 'ET', 'ID', 'IN', 'KH', 'LA', 'LK', 'MV', 'MY', 'MU', 'MM', 'NP', 'PK', 'SG', 'TH' ); 298 $logographic_countries = array( 'CN', 'JP', 'KR', 'MY', 'SG'); 299 300 $all_ideographic_countries = array_merge( $abjad_countries, $abugida_countries, $logographic_countries ); 301 302 return array_unique( $all_ideographic_countries ); 303 } 304 305 /** 306 * Build a string of placeholders to pass to `WPDB::prepare()` 307 * 308 * Sometimes it's convenient to be able to generate placeholders for `prepare()` dynamically. For example, when 309 * looping through a multi-dimensional array where the sub-arrays have distinct counts; or when the total 310 * number of items is too large to conveniently count by hand. 311 * 312 * See https://iandunn.name/2016/03/31/generating-dynamic-placeholders-for-wpdb-prepare/ 313 * 314 * @param int $number The number of placeholders needed 315 * @param string $format An sprintf()-like format accepted by WPDB::prepare() 316 * 317 * @return string 318 */ 319 function get_prepare_placeholders( $number, $format ) { 320 return implode( ', ', array_fill( 0, $number, $format ) ); 210 321 } 211 322 -
sites/trunk/api.wordpress.org/public_html/events/1.0/tests/test-index.php
r5276 r5277 305 305 ), 306 306 307 'city-endonym- non-latin-length-greater-than-4-asia' => array(307 'city-endonym-ideographic-asia1' => array( 308 308 'input' => array( 309 309 'location_name' => '白浜町宇佐崎南', … … 319 319 ), 320 320 321 /* 322 * @todo 323 * 324 * This is currently failing, but should pass. It looks like the value is in row 112931. 325 */ 326 'city-endonym-non-latin-length-greater-than-4-asia2' => array( 321 'city-endonym-ideographic-asia2' => array( 327 322 'input' => array( 328 323 'location_name' => 'تهران', … … 338 333 ), 339 334 340 /* 341 * @todo 342 * 343 * This is currently failing, but should pass. It looks like the value is in row 1174872 344 */ 345 'city-endonym-non-latin-length-greater-than-4-asia3' => array( 335 'city-endonym-ideographic-asia3' => array( 346 336 'input' => array( 347 337 'location_name' => 'كراچى', … … 357 347 ), 358 348 359 /* 360 * @todo 361 * 362 * This is currently failing, but it should pass. One reason it may be failing is that `ft_min_word_len` 363 * is set to `4` and `東京` is only `2`. 364 * 365 * But, there are others that are failing that are >= 4, though, like `シラオカ`, `しらおか`, `Ширахама`, and 366 *`すぎと,スギト`. So, there may be additional reasons too. 367 */ 368 'city-endonym-non-latin-length-less-than-4-asia' => array( 349 'city-endonym-ideographic-asia4' => array( 350 'input' => array( 351 'location_name' => '京都', 352 'locale' => 'ja', 353 'timezone' => 'Asia/Tokyo', 354 ), 355 'expected' => array( 356 'description' => 'kyoto', 357 'latitude' => '35.021', 358 'longitude' => '135.754', 359 'country' => 'JP', 360 ), 361 ), 362 363 'city-endonym-ideographic-asia5' => array( 369 364 'input' => array( 370 365 'location_name' => '東京',
Note: See TracChangeset
for help on using the changeset viewer.