After exporting pages from Confluence, the next step was to convert them from the Confluence wiki format to the MediaWiki format. The differences are sometimes quite amusing - for a link whose text differs from the name of the page it points to, Confluence puts the text first and the page second (`[text|page]`), while MediaWiki puts the page first and the text second (`[[page|text]]`).
Not really my best code - I think I was writing Python in a very PHPesque way. But it functioned sufficiently to convert our pages.
#!/usr/bin/env python
"""Convert exported Confluence wiki pages to MediaWiki markup.

Every file found under ``orig-pages`` is read as UTF-8, passed through
``mangle`` and written to the same relative path under ``conv-pages``.

The script avoids the Python-2-only ``cStringIO`` module, ``os.path.walk``
and the ``print`` statement, and only does work when run as a program, so
importing it has no side effects.
"""
import io
import os
import os.path
import re

SOURCE_DIR = 'orig-pages'
TARGET_DIR = 'conv-pages'
NOFORMAT = '{noformat}'

# Any bracketed Confluence link: [Page] or [Label|Page].
link_re = re.compile(r"\[([^\]]+)\]")
# A bracketed link carrying a label: group 1 is the label, group 2 the target.
linktext_re = re.compile(r"\[([^\|\]]+)\|([^\]]+)\]")
# Single-word emphasis only; the "no spaces" rule keeps "* item" list bullets
# from being mistaken for bold markers.
bold_re = re.compile(r"\*([^ *]+)\*")
italic_re = re.compile(r"\b_([^ _]+)_\b")

# Confluence headings are only headings at the start of a line.
_heading_re = re.compile(r"^h([1-4])\.\s*(.*?)\s*$")
# Targets that MediaWiki treats as external (single-bracket) links.
_external_re = re.compile(r"^(?:https?://|ftp://|mailto:)", re.IGNORECASE)


def append_page(all_files, name, filenames):
    """Record the path of every file in one directory.

    all_files -- list that the paths are appended to (modified in place).
    name      -- directory that contains ``filenames``.
    filenames -- names of the files inside ``name``.

    The directory is joined onto each name so that pages living in
    subdirectories can be found again later.
    """
    all_files.extend(os.path.join(name, f) for f in filenames)


def _convert_heading(line):
    """Turn a leading ``hN.`` marker into a MediaWiki heading.

    ``hN.`` maps to N+1 equals signs, so ``h1. Title`` becomes
    ``== Title ==``.  Lines that do not start with a marker are returned
    unchanged.
    """
    match = _heading_re.match(line)
    if not match:
        return line
    marks = '=' * (int(match.group(1)) + 1)
    return "%s %s %s" % (marks, match.group(2), marks)


def _rewrite_link(match):
    """Return the MediaWiki form of one bracketed Confluence link.

    Each link is classified on its own, so a line may mix internal and
    external links.  External links use single brackets and a space
    separator; internal links use double brackets and a pipe.
    """
    labelled = linktext_re.match(match.group(0))
    if labelled:
        label, target = labelled.group(1), labelled.group(2)
    else:
        label, target = None, match.group(1)

    if _external_re.match(target):
        if label is None:
            return "[%s]" % (target,)
        return "[%s %s]" % (target, label)
    if label is None:
        return "[[%s]]" % (target,)
    # Confluence is [label|page]; MediaWiki is [[page|label]].
    return "[[%s|%s]]" % (target, label)


def _convert_inline(text):
    """Convert links, bold and italic markup in a piece of ordinary text."""
    text = link_re.sub(_rewrite_link, text)
    text = bold_re.sub(r"'''\1'''", text)
    text = italic_re.sub(r"''\1''", text)
    return text


def mangle(contents):
    """Convert Confluence wiki markup to MediaWiki markup.

    contents -- the full text of one page.

    Returns the converted text with the same line structure as the input.

    ``{noformat}`` markers are replaced alternately by ``<pre>`` and
    ``</pre>``.  The open/closed state is tracked per marker, so a block
    may open and close on a single line, and text inside a block is copied
    through untouched.
    """
    converted = []
    in_noformat = False
    for line in contents.split("\n"):
        if not in_noformat:
            line = _convert_heading(line)
        pieces = []
        # Every boundary between two segments is one {noformat} marker.
        for index, segment in enumerate(line.split(NOFORMAT)):
            if index:
                pieces.append('</pre>' if in_noformat else '<pre>')
                in_noformat = not in_noformat
            pieces.append(segment if in_noformat else _convert_inline(segment))
        converted.append(''.join(pieces))
    return "\n".join(converted)


def main():
    """Convert every page under SOURCE_DIR into TARGET_DIR."""
    files = []
    # os.walk yields only regular files in its third element, so directory
    # names never end up in the list of pages to open.
    for dirpath, _dirnames, filenames in os.walk(SOURCE_DIR):
        append_page(files, dirpath, filenames)
    print(files)

    for source_path in files:
        relative = os.path.relpath(source_path, SOURCE_DIR)
        target_path = os.path.join(TARGET_DIR, relative)
        target_dir = os.path.dirname(target_path)
        if not os.path.isdir(target_dir):
            os.makedirs(target_dir)
        with io.open(source_path, 'r', encoding='utf-8') as source:
            contents = source.read()
        with io.open(target_path, 'w', encoding='utf-8') as target:
            target.write(mangle(contents))


if __name__ == "__main__":
    main()
2 Responses
Nicholas Riley — July 21, 2006 at 08:00 PM.
Neil Blakey-Milner — July 21, 2006 at 08:20 PM.
Have your say