This repository was archived by the owner on Jun 3, 2020. It is now read-only.
1 change: 0 additions & 1 deletion README.rst
@@ -66,7 +66,6 @@ Some things like custom handling of non standard post types is not fully configu
Known issues
============
* Target file names are sometimes less than optimal.
-* Rewriting of image/attachment links if they are downloaded would be a good feature
* There will probably be issues when migrating non-UTF-8 encoded WordPress dump files (if they exist).

Other Tools
86 changes: 43 additions & 43 deletions exitwp.py
@@ -69,9 +69,6 @@ def _start_ns(self, prefix, ns):


def html2fmt(html, target_format):
-    # html = html.replace("\n\n", '<br/><br/>')
-    # html = html.replace('<pre lang="xml">', '<pre lang="xml"><![CDATA[')
-    # html = html.replace('</pre>', ']]></pre>')
    if target_format == 'html':
        return html
    else:
@@ -116,7 +113,6 @@ def parse_items():

def gi(q, unicode_wrap=True, empty=False):
    namespace = ''
-    tag = ''
    if q.find(':') > 0:
        namespace, tag = q.split(':', 1)
    else:
@@ -137,17 +133,6 @@ def gi(q, unicode_wrap=True, empty=False):
        # body = body.replace(key, body_replace[key])
        body = re.sub(key, body_replace[key], body)

-    img_srcs = []
-    if body is not None:
-        try:
-            soup = BeautifulSoup(body)
-            img_tags = soup.find_all('img')
-            for img in img_tags:
-                img_srcs.append(img['src'])
-        except:
-            print 'could not parse html: ' + body
-    # print img_srcs
-
excerpt = gi('excerpt:encoded', empty=True)

export_item = {
@@ -163,8 +148,7 @@ def gi(q, unicode_wrap=True, empty=False):
    'comments': gi('wp:comment_status') == u'open',
    'taxanomies': export_taxanomies,
    'body': body,
-    'excerpt': excerpt,
-    'img_srcs': img_srcs
+    'excerpt': excerpt
}

export_items.append(export_item)
@@ -202,7 +186,6 @@ def open_file(file):
    return f

def get_item_uid(item, date_prefix=False, namespace=''):
-    result = None
    if namespace not in item_uids:
        item_uids[namespace] = {}
@@ -236,18 +219,18 @@ def get_item_path(item, dir=''):
    filename_parts = [full_dir, '/']
    filename_parts.append(item['uid'])
    if item['type'] == 'page':
-        if (not os.path.exists(''.join(filename_parts))):
+        if not os.path.exists(''.join(filename_parts)):
            os.makedirs(''.join(filename_parts))
        filename_parts.append('/index')
    filename_parts.append('.')
    filename_parts.append(target_format)
    return ''.join(filename_parts)

-def get_attachment_path(src, dir, dir_prefix='images'):
+def get_attachment_file_name(src, uid):
    try:
-        files = attachments[dir]
+        files = attachments[uid]
    except KeyError:
-        attachments[dir] = files = {}
+        attachments[uid] = files = {}

    try:
        filename = files[src]
@@ -264,25 +247,17 @@ def get_attachment_path(src, dir, dir_prefix='images'):
            file_infix = file_infix + 1
        files[src] = filename = maybe_filename

-    target_dir = os.path.normpath(blog_dir + '/' + dir_prefix + '/' + dir)
-    target_file = os.path.normpath(target_dir + '/' + filename)
-
-    if (not os.path.exists(target_dir)):
-        os.makedirs(target_dir)
-
-    # if src not in attachments[dir]:
-    # print target_name
-    return target_file
+    return filename
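
The renamed helper now resolves only a collision-free file name and leaves directory handling to the caller. Since the collision loop itself sits in the collapsed context above, here is a minimal Python 3 sketch of the idea (the script itself is Python 2; the urlparse-based name extraction and the exact counter format are assumptions, not the project's verbatim code)::

    import os
    from urllib.parse import urlparse

    attachments = {}

    def get_attachment_file_name(src, uid):
        # One name table per item uid, so collisions are scoped per post/page.
        files = attachments.setdefault(uid, {})
        if src in files:
            return files[src]
        root, ext = os.path.splitext(os.path.basename(urlparse(src).path))
        if not root:
            root = '1'  # fall back when the URL path has no usable basename
        candidate, infix = root + ext, 1
        # Another source URL already claimed this name: insert a numeric infix.
        while candidate in files.values():
            candidate = '%s-%d%s' % (root, infix, ext)
            infix += 1
        files[src] = candidate
        return candidate

Two different URLs that both end in photo.jpg would then map to photo.jpg and photo-1.jpg within the same item.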

for i in data['items']:
    skip_item = False

    for field, value in item_field_filter.iteritems():
-        if(i[field] == value):
+        if i[field] == value:
            skip_item = True
            break

-    if(skip_item):
+    if skip_item:
        continue

    sys.stdout.write('.')
@@ -324,19 +299,44 @@ def get_attachment_path(src, dir, dir_prefix='images'):
        out = open_file(fn)
        yaml_header['layout'] = 'page'
    elif i['type'] in item_type_filter:
-        pass
+        continue
    else:
        print 'Unknown item type :: ' + i['type']

-    if download_images:
-        for img in i['img_srcs']:
-            try:
-                urlretrieve(urljoin(data['header']['link'],
-                            img.encode('utf-8')),
-                            get_attachment_path(img, i['uid']))
-            except:
-                print '\n unable to download ' + urljoin(
-                    data['header']['link'], img.encode('utf-8'))
+    if download_images and i['body'] is not None:
+        try:
+            soup = BeautifulSoup(i['body'])
+            img_tags = soup.find_all('img')
+
+            image_dir = os.path.join('images', i['uid'])
+            target_dir = os.path.normpath(os.path.join(blog_dir, image_dir))
+
+            if img_tags and not os.path.exists(target_dir):
+                os.makedirs(target_dir)
+
+            for img in img_tags:
+                try:
+                    attachment_file_name = \
+                        get_attachment_file_name(img['src'], i['uid'])
+                    attachment_file_path = os.path.join(target_dir, attachment_file_name)
+                    attachment_url = "/" + os.path.join(image_dir, attachment_file_name)
+
+                    urlretrieve(urljoin(data['header']['link'],
+                                img['src'].encode('utf-8')),
+                                attachment_file_path)
+
+                    # Substitute the image link with the path of the downloaded copy
+                    img['src'] = attachment_url
+
+                except:
+                    print '\n unable to download ' + urljoin(
+                        data['header']['link'], img['src'].encode('utf-8'))
+
+            if img_tags:
+                i['body'] = soup.prettify()
+        except:
+            print 'could not parse html: ' + i['body']

    if out is not None:
        def toyaml(data):
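
Read as a whole, the new block parses each item body once, downloads every <img> into images/<uid>/ under the blog directory, rewrites each src to the local path, and reserializes the body only when images were actually found (prettify() reflows the markup, so untouched bodies are left alone). A condensed Python 3 sketch of the same flow, assuming BeautifulSoup 4 and the get_attachment_file_name helper sketched earlier (localize_images and site_link are hypothetical names, not part of the patch)::

    import os
    from urllib.parse import urljoin
    from urllib.request import urlretrieve

    from bs4 import BeautifulSoup

    def localize_images(item, site_link, blog_dir):
        # Parse the post body and collect its image tags.
        soup = BeautifulSoup(item['body'], 'html.parser')
        img_tags = soup.find_all('img')
        if not img_tags:
            return

        image_dir = os.path.join('images', item['uid'])
        target_dir = os.path.join(blog_dir, image_dir)
        os.makedirs(target_dir, exist_ok=True)

        for img in img_tags:
            name = get_attachment_file_name(img['src'], item['uid'])
            try:
                # Resolve relative srcs against the blog's base link before fetching.
                urlretrieve(urljoin(site_link, img['src']),
                            os.path.join(target_dir, name))
                # Point the tag at the downloaded copy.
                img['src'] = '/' + os.path.join(image_dir, name)
            except OSError:
                print('unable to download ' + urljoin(site_link, img['src']))

        item['body'] = soup.prettify()

The per-image try/except mirrors the patch's design choice: one unreachable URL skips that image but still lets the rest of the body be localized.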