diff --git a/README.rst b/README.rst
index cdaeb1a..12a1306 100644
--- a/README.rst
+++ b/README.rst
@@ -66,7 +66,6 @@ Some things like custom handling of non standard post types is not fully configu
 Known issues
 ============
  * Target file names are some times less than optimal.
- * Rewriting of image/attachment links if they are downloaded would be a good feature
  * There will probably be issues when migrating non utf-8 encoded wordpress dump files (if they exist).
 
 Other Tools
diff --git a/exitwp.py b/exitwp.py
index 0a59010..020f633 100755
--- a/exitwp.py
+++ b/exitwp.py
@@ -69,9 +69,6 @@ def _start_ns(self, prefix, ns):
 
 
 def html2fmt(html, target_format):
-    # html = html.replace("\n\n", '<br/><br/>')
-    # html = html.replace('<pre>', '<pre><![CDATA[')
-    # html = html.replace('</pre>', ']]></pre>')
     if target_format == 'html':
         return html
     else:
@@ -116,7 +113,6 @@ def parse_items():
 
     def gi(q, unicode_wrap=True, empty=False):
         namespace = ''
-        tag = ''
         if q.find(':') > 0:
             namespace, tag = q.split(':', 1)
         else:
@@ -137,17 +133,6 @@ def gi(q, unicode_wrap=True, empty=False):
                 # body = body.replace(key, body_replace[key])
                 body = re.sub(key, body_replace[key], body)
 
-        img_srcs = []
-        if body is not None:
-            try:
-                soup = BeautifulSoup(body)
-                img_tags = soup.find_all('img')
-                for img in img_tags:
-                    img_srcs.append(img['src'])
-            except:
-                print 'could not parse html: ' + body
-        # print img_srcs
-
         excerpt = gi('excerpt:encoded', empty=True)
 
         export_item = {
@@ -163,8 +148,7 @@ def gi(q, unicode_wrap=True, empty=False):
             'comments': gi('wp:comment_status') == u'open',
             'taxanomies': export_taxanomies,
             'body': body,
-            'excerpt': excerpt,
-            'img_srcs': img_srcs
+            'excerpt': excerpt
         }
 
         export_items.append(export_item)
@@ -202,7 +186,6 @@ def open_file(file):
         return f
 
     def get_item_uid(item, date_prefix=False, namespace=''):
-        result = None
        if namespace not in item_uids:
            item_uids[namespace] = {}
 
@@ -236,18 +219,18 @@ def get_item_path(item, dir=''):
         filename_parts = [full_dir, '/']
         filename_parts.append(item['uid'])
         if item['type'] == 'page':
-            if (not os.path.exists(''.join(filename_parts))):
+            if not os.path.exists(''.join(filename_parts)):
                 os.makedirs(''.join(filename_parts))
             filename_parts.append('/index')
         filename_parts.append('.')
         filename_parts.append(target_format)
         return ''.join(filename_parts)
 
-    def get_attachment_path(src, dir, dir_prefix='images'):
+    def get_attachment_file_name(src, uid):
         try:
-            files = attachments[dir]
+            files = attachments[uid]
         except KeyError:
-            attachments[dir] = files = {}
+            attachments[uid] = files = {}
 
         try:
             filename = files[src]
@@ -264,25 +247,17 @@ def get_attachment_path(src, dir, dir_prefix='images'):
                 file_infix = file_infix + 1
             files[src] = filename = maybe_filename
 
-        target_dir = os.path.normpath(blog_dir + '/' + dir_prefix + '/' + dir)
-        target_file = os.path.normpath(target_dir + '/' + filename)
-
-        if (not os.path.exists(target_dir)):
-            os.makedirs(target_dir)
-
-        # if src not in attachments[dir]:
-        #     print target_name
-        return target_file
+        return filename
 
     for i in data['items']:
         skip_item = False
 
         for field, value in item_field_filter.iteritems():
-            if(i[field] == value):
+            if i[field] == value:
                 skip_item = True
                 break
 
-        if(skip_item):
+        if skip_item:
             continue
 
         sys.stdout.write('.')
@@ -324,19 +299,44 @@ def get_attachment_path(src, dir, dir_prefix='images'):
             out = open_file(fn)
             yaml_header['layout'] = 'page'
         elif i['type'] in item_type_filter:
-            pass
+            continue
         else:
             print 'Unknown item type :: ' + i['type']
 
-        if download_images:
-            for img in i['img_srcs']:
-                try:
-                    urlretrieve(urljoin(data['header']['link'],
-                                        img.encode('utf-8')),
-                                get_attachment_path(img, i['uid']))
-                except:
-                    print '\n unable to download ' + urljoin(
-                        data['header']['link'], img.encode('utf-8'))
+        if download_images and i['body'] is not None:
+            try:
+                soup = BeautifulSoup(i['body'])
+                img_tags = soup.find_all('img')
+
+                image_dir = os.path.join('images', i['uid'])
+                target_dir = os.path.normpath(os.path.join(blog_dir, image_dir))
+
+                if img_tags and not os.path.exists(target_dir):
+                    os.makedirs(target_dir)
+
+                for img in img_tags:
+                    try:
+
+                        attachment_file_name = \
+                            get_attachment_file_name(img['src'], i['uid'])
+                        attachment_file_path = os.path.join(target_dir, attachment_file_name)
+                        attachment_url = "/" + os.path.join(image_dir, attachment_file_name)
+
+                        urlretrieve(urljoin(data['header']['link'],
+                                            img['src'].encode('utf-8')),
+                                    attachment_file_path)
+
+                        # Substitute the image link with the path of the downloaded copy
+                        img['src'] = attachment_url
+
+                    except:
+                        print '\n unable to download ' + urljoin(
+                            data['header']['link'], img['src'].encode('utf-8'))
+
+                if img_tags:
+                    i['body'] = soup.prettify()
+            except:
+                print 'could not parse html: ' + i['body']
 
         if out is not None:
             def toyaml(data):
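
For reference, the core technique this patch introduces pairs get_attachment_file_name (per-uid de-duplication of target file names) with a BeautifulSoup pass that downloads every <img> in a post body and rewrites its src to point at the local copy. Below is a minimal, self-contained sketch of that flow, ported to Python 3 for illustration (the patch itself targets Python 2's urllib and print syntax); the names download_post_images and unique_file_name are illustrative helpers, not part of exitwp:

    import os
    from urllib.parse import urljoin
    from urllib.request import urlretrieve

    from bs4 import BeautifulSoup


    def unique_file_name(src, files):
        # Reuse the name already chosen for this source URL, if any.
        if src in files:
            return files[src]
        base = os.path.basename(src) or 'image'
        root, ext = os.path.splitext(base)
        candidate, infix = base, 2
        # Append -2, -3, ... while the name is taken by a different URL,
        # mirroring the file_infix loop in get_attachment_file_name.
        while candidate in files.values():
            candidate = '%s-%d%s' % (root, infix, ext)
            infix += 1
        files[src] = candidate
        return candidate


    def download_post_images(body, base_url, target_dir, url_prefix):
        # Fetch every <img> in the post body and point each src at the
        # downloaded local copy; returns the rewritten HTML.
        soup = BeautifulSoup(body, 'html.parser')
        files = {}
        for img in soup.find_all('img'):
            if not img.get('src'):
                continue
            name = unique_file_name(img['src'], files)
            os.makedirs(target_dir, exist_ok=True)
            try:
                # Relative srcs are resolved against the blog's base URL,
                # as the patch does with data['header']['link'].
                urlretrieve(urljoin(base_url, img['src']),
                            os.path.join(target_dir, name))
            except OSError:
                continue  # keep the original src if the download fails
            img['src'] = url_prefix + '/' + name
        return str(soup)

A caller would use it roughly as the per-item loop above does, e.g. body = download_post_images(i['body'], data['header']['link'], os.path.join(blog_dir, 'images', i['uid']), '/images/' + i['uid']).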