| | 1 | <?php |
| | 2 | |
| | 3 | namespace WordPressdotorg\Markdown; |
| | 4 | |
| | 5 | use WP_CLI; |
| | 6 | use WP_Error; |
| | 7 | use WP_Post; |
| | 8 | use WP_Query; |
| | 9 | use WPCom_GHF_Markdown_Parser; |
| | 10 | |
abstract class Importer {
	/**
	 * Meta key to store source in.
	 *
	 * @var string
	 */
	protected $meta_key = 'wporg_markdown_source';

	/**
	 * Meta key to store request ETag in.
	 *
	 * @var string
	 */
	protected $etag_meta_key = 'wporg_markdown_etag';

	/**
	 * Posts per page to query for.
	 *
	 * This needs to be set at least as high as the number of pages being
	 * imported, but should not be unbounded (-1).
	 *
	 * @var int
	 */
	protected $posts_per_page = 350;

	/**
	 * Get base URL for all pages.
	 *
	 * This is used for generating the keys for the existing pages.
	 *
	 * @see static::get_existing_for_post()
	 *
	 * @return string Base URL to strip from page permalink.
	 */
	abstract protected function get_base();

	/**
	 * Get manifest URL.
	 *
	 * This URL should point to a JSON file containing the manifest for the
	 * site's content. (Typically raw.githubusercontent.com)
	 *
	 * @return string URL for the manifest file.
	 */
	abstract protected function get_manifest_url();

	/**
	 * Get post type for the type being imported.
	 *
	 * @return string Post type slug to import as.
	 */
	abstract public function get_post_type();

	/**
	 * Get existing data for a given post.
	 *
	 * The key is the post permalink with the base URL removed and trailing
	 * slashes trimmed; an empty result is mapped to 'index'.
	 *
	 * @param WP_Post $post Post to get existing data for.
	 * @return array 2-tuple of array key and data.
	 */
	protected function get_existing_for_post( WP_Post $post ) {
		$key = rtrim( str_replace( $this->get_base(), '', get_permalink( $post->ID ) ), '/' );
		if ( empty( $key ) ) {
			$key = 'index';
		}

		$data = array(
			'post_id' => $post->ID,
		);
		return array( $key, $data );
	}

	/**
	 * Import the manifest.
	 *
	 * Fetches the manifest, parses, and creates pages as needed. Documents
	 * whose key matches an existing published post only have their stored
	 * source URL refreshed; all other documents are created.
	 *
	 * @return WP_Error|null Error when the manifest cannot be fetched or decoded, nothing otherwise.
	 */
	public function import_manifest() {
		$response = wp_remote_get( $this->get_manifest_url() );
		if ( is_wp_error( $response ) ) {
			if ( class_exists( 'WP_CLI' ) ) {
				WP_CLI::error( $response->get_error_message() );
			}
			return $response;
		} elseif ( 200 !== wp_remote_retrieve_response_code( $response ) ) {
			if ( class_exists( 'WP_CLI' ) ) {
				WP_CLI::error( 'Non-200 from Markdown source' );
			}
			return new WP_Error( 'invalid-http-code', 'Markdown source returned non-200 http code.' );
		}

		$manifest = json_decode( wp_remote_retrieve_body( $response ), true );
		// A scalar JSON document decodes to a truthy non-array; reject it before iterating.
		if ( ! is_array( $manifest ) || empty( $manifest ) ) {
			if ( class_exists( 'WP_CLI' ) ) {
				WP_CLI::error( 'Invalid manifest' );
			}
			return new WP_Error( 'invalid-manifest', 'Manifest did not unfurl properly.' );
		}

		// Fetch all handbook posts for comparison.
		$q = new WP_Query( array(
			'post_type'      => $this->get_post_type(),
			'post_status'    => 'publish',
			'posts_per_page' => $this->posts_per_page,
		) );

		$existing = array();
		foreach ( $q->posts as $post ) {
			list( $key, $data ) = $this->get_existing_for_post( $post );
			$existing[ $key ]   = $data;
		}

		$created = 0;
		$updated = 0;
		foreach ( $manifest as $key => $doc ) {
			// Already exists, update.
			if ( ! empty( $existing[ $key ] ) ) {
				$existing_id = $existing[ $key ]['post_id'];
				if ( $this->update_post_from_manifest_doc( $existing_id, $doc ) ) {
					$updated++;
				}

				continue;
			}

			if ( $this->process_manifest_doc( $doc, $existing, $manifest ) ) {
				$created++;
			}
		}

		if ( class_exists( 'WP_CLI' ) ) {
			WP_CLI::success( "Successfully created {$created} and updated {$updated} handbook pages." );
		}
	}

	/**
	 * Process a document from the manifest.
	 *
	 * When the document names a parent that does not exist yet, the parent is
	 * created first (recursively) so the new post can be attached to it.
	 *
	 * @param array $doc      Document to process.
	 * @param array $existing List of existing posts, will be added to.
	 * @param array $manifest Manifest data.
	 * @return boolean True if processing succeeded, false otherwise.
	 */
	protected function process_manifest_doc( $doc, &$existing, $manifest ) {
		$post_parent = null;
		if ( ! empty( $doc['parent'] ) ) {
			// Find the parent in the existing set, creating it on demand.
			if ( empty( $existing[ $doc['parent'] ] ) ) {
				// A parent that is absent from the manifest cannot be created.
				if ( ! isset( $manifest[ $doc['parent'] ] ) ) {
					if ( class_exists( 'WP_CLI' ) ) {
						WP_CLI::warning( "Parent {$doc['parent']} is missing from the manifest." );
					}
					return false;
				}
				if ( ! $this->process_manifest_doc( $manifest[ $doc['parent'] ], $existing, $manifest ) ) {
					return false;
				}
			}
			if ( ! empty( $existing[ $doc['parent'] ] ) ) {
				$parent      = $existing[ $doc['parent'] ];
				$post_parent = $parent['post_id'];
			}
		}

		$post = $this->create_post_from_manifest_doc( $doc, $post_parent );
		if ( $post ) {
			list( $key, $data ) = $this->get_existing_for_post( $post );
			$existing[ $key ]   = $data;
			return true;
		}
		return false;
	}

	/**
	 * Create a new handbook page from the manifest document.
	 *
	 * The post title defaults to the slug and is replaced by the manifest
	 * title when one is provided. The document's source URL is stored in post
	 * meta for the later Markdown import.
	 *
	 * @param array    $doc         Document details from the manifest.
	 * @param int|null $post_parent Parent post ID, or null for a top-level page.
	 * @return WP_Post|false|null Result of get_post() for the new post, false if the insert failed.
	 */
	protected function create_post_from_manifest_doc( $doc, $post_parent = null ) {
		if ( $doc['slug'] === 'index' ) {
			$doc['slug'] = $this->get_post_type();
		}
		$post_data = array(
			'post_type'   => $this->get_post_type(),
			'post_status' => 'publish',
			'post_parent' => $post_parent,
			'post_title'  => wp_slash( $doc['slug'] ),
			'post_name'   => sanitize_title_with_dashes( $doc['slug'] ),
		);
		if ( isset( $doc['title'] ) ) {
			// Must land in $post_data; writing it back to $doc would silently drop the title.
			$post_data['post_title'] = sanitize_text_field( wp_slash( $doc['title'] ) );
		}
		$post_id = wp_insert_post( $post_data );
		if ( ! $post_id ) {
			return false;
		}
		if ( class_exists( 'WP_CLI' ) ) {
			WP_CLI::log( "Created post {$post_id} for {$doc['slug']}." );
		}
		update_post_meta( $post_id, $this->meta_key, esc_url_raw( $doc['markdown_source'] ) );
		return get_post( $post_id );
	}

	/**
	 * Update an existing post from the manifest.
	 *
	 * Only the stored Markdown source URL is refreshed here.
	 *
	 * @param int   $post_id Existing post ID.
	 * @param array $doc     Document details from the manifest.
	 * @return boolean True if updated, false otherwise.
	 */
	protected function update_post_from_manifest_doc( $post_id, $doc ) {
		$did_update = update_post_meta( $post_id, $this->meta_key, esc_url_raw( $doc['markdown_source'] ) );
		return $did_update;
	}

	/**
	 * Update existing posts from Markdown source.
	 *
	 * Reparses the Markdown for every page.
	 */
	public function import_all_markdown() {
		$q = new WP_Query( array(
			'post_type'      => $this->get_post_type(),
			'post_status'    => 'publish',
			'fields'         => 'ids',
			'posts_per_page' => $this->posts_per_page,
		) );
		$ids     = $q->posts;
		$success = 0;
		foreach ( $ids as $id ) {
			$ret = $this->update_post_from_markdown_source( $id );
			if ( class_exists( 'WP_CLI' ) ) {
				if ( is_wp_error( $ret ) ) {
					WP_CLI::warning( $ret->get_error_message() );
				} elseif ( false === $ret ) {
					// An unchanged source still counts as a successfully processed page.
					WP_CLI::log( "No updates for {$id}" );
					$success++;
				} else {
					WP_CLI::log( "Updated {$id} from markdown source" );
					$success++;
				}
			}
		}
		if ( class_exists( 'WP_CLI' ) ) {
			$total = count( $ids );
			WP_CLI::success( "Successfully updated {$success} of {$total} pages." );
		}
	}

	/**
	 * Update a post from its Markdown source.
	 *
	 * Fetches the source (sending the stored ETag as If-None-Match), converts
	 * it to HTML and saves title, excerpt and content. The new ETag is only
	 * stored once the post has been saved.
	 *
	 * @param int $post_id Post ID to update.
	 * @return boolean|WP_Error True if updated, false if no update needed, error otherwise.
	 */
	protected function update_post_from_markdown_source( $post_id ) {
		$markdown_source = $this->get_markdown_source( $post_id );
		if ( is_wp_error( $markdown_source ) ) {
			return $markdown_source;
		}
		if ( ! function_exists( 'jetpack_require_lib' ) ) {
			return new WP_Error( 'missing-jetpack-require-lib', 'jetpack_require_lib() is missing on system.' );
		}

		// Transform GitHub repo HTML pages into their raw equivalents.
		$markdown_source = preg_replace( '#https?://github\.com/([^/]+/[^/]+)/blob/(.+)#', 'https://raw.githubusercontent.com/$1/$2', $markdown_source );
		$markdown_source = add_query_arg( 'v', time(), $markdown_source );

		// Grab the stored ETag, and use it to deduplicate.
		$args      = array(
			'headers' => array(),
		);
		$last_etag = get_post_meta( $post_id, $this->etag_meta_key, true );
		if ( ! empty( $last_etag ) ) {
			$args['headers']['If-None-Match'] = $last_etag;
		}

		$response = wp_remote_get( $markdown_source, $args );
		if ( is_wp_error( $response ) ) {
			return $response;
		} elseif ( 304 === wp_remote_retrieve_response_code( $response ) ) {
			// No update required!
			return false;
		} elseif ( 200 !== wp_remote_retrieve_response_code( $response ) ) {
			return new WP_Error( 'invalid-http-code', 'Markdown source returned non-200 http code.' );
		}

		$etag = wp_remote_retrieve_header( $response, 'etag' );

		list( $title, $excerpt, $markdown ) = $this->split_markdown_document( wp_remote_retrieve_body( $response ) );

		// Transform to HTML and save the post.
		jetpack_require_lib( 'markdown' );
		$parser                      = new WPCom_GHF_Markdown_Parser();
		$parser->preserve_shortcodes = false;
		$html                        = $parser->transform( $markdown );
		$post_data                   = array(
			'ID'           => $post_id,
			'post_content' => wp_filter_post_kses( wp_slash( $html ) ),
			'post_excerpt' => sanitize_text_field( wp_slash( $excerpt ) ),
		);
		if ( ! is_null( $title ) ) {
			$post_data['post_title'] = sanitize_text_field( wp_slash( $title ) );
		}

		// Ask for a WP_Error so a failed save is reported instead of being
		// hidden behind a freshly stored ETag (which would yield 304s later).
		$result = wp_update_post( $post_data, true );
		if ( is_wp_error( $result ) ) {
			return $result;
		}

		// Set ETag for future updates.
		update_post_meta( $post_id, $this->etag_meta_key, wp_slash( $etag ) );

		return true;
	}

	/**
	 * Split a raw Markdown document into title, excerpt and remaining body.
	 *
	 * Leading YAML front matter is dropped. A leading `# ` heading becomes the
	 * title and the first remaining line becomes the excerpt; both are
	 * removed from the returned body.
	 *
	 * @param string $markdown Raw Markdown document.
	 * @return array 3-tuple of title (null when there is no leading heading), excerpt and body.
	 */
	private function split_markdown_document( $markdown ) {
		// Strip YAML doc from the header.
		$markdown = preg_replace( '#^---(.+)---#Us', '', $markdown );
		// Removing front matter leaves leading newlines; trim so the `^`
		// anchors below see the first real line.
		$markdown = trim( $markdown );

		$title = null;
		if ( preg_match( '/^#\s(.+)/', $markdown, $matches ) ) {
			$title = $matches[1];
			// Remove exactly the heading that was just captured.
			$markdown = preg_replace( '/^#\s(.+)/', '', $markdown );
		}
		$markdown = trim( $markdown );

		// Steal the first line as the excerpt.
		$excerpt = '';
		if ( preg_match( '/^(.+)/', $markdown, $matches ) ) {
			$excerpt  = $matches[1];
			$markdown = preg_replace( '/^(.+)/', '', $markdown );
		}

		return array( $title, $excerpt, $markdown );
	}

	/**
	 * Retrieve the markdown source URL for a given post.
	 *
	 * @param int $post_id Post ID to look up.
	 * @return string|WP_Error Stored source URL, or an error when none is stored.
	 */
	public function get_markdown_source( $post_id ) {
		$markdown_source = get_post_meta( $post_id, $this->meta_key, true );
		if ( ! $markdown_source ) {
			return new WP_Error( 'missing-markdown-source', 'Markdown source is missing for post.' );
		}

		return $markdown_source;
	}
}