# _wordpress2jekyll.py - RDoc Documentation

#!/usr/bin/env python3

import sys, re

def re_sub(pattern, replacement, string):
    """Replace every match of ``pattern`` in ``string`` with ``replacement``.

    Group references in ``replacement`` (``\\1``, ``\\g<name>``) that point
    at a group which did not take part in the match expand to the empty
    string instead of raising an error.

    Historically this needed a workaround (see
    http://bugs.python.org/issue1519638) that wrapped the match object and
    called the private ``re._expand`` hook.  Since Python 3.5 ``re.sub``
    itself substitutes an empty string for unmatched groups, and the private
    hook is not part of the public API (it is missing from recent CPython
    releases), so the public function is used directly.

    Args:
        pattern: A regular expression, either as a string or already
            compiled.
        replacement: The replacement template.
        string: The text to search.

    Returns:
        The text with all non-overlapping matches replaced.
    """
    return re.sub(pattern, replacement, string)

def parseCaptions(content):
    """Convert WordPress ``[caption]`` shortcodes into ``{% caption %}`` tags.

    A shortcode is only recognised when the ``<img>`` attributes appear in
    the order ``class``, ``src``, ``alt``, optional ``title``, ``width``,
    ``height`` and the image lives below
    ``http://martin-thoma.com/wp-content/uploads/``.  For example

        [caption id="attachment_76716" align="aligncenter" width="500"]<a href="http://martin-thoma.com/wp-content/uploads/2013/11/WER-calculation.png"><img class="size-full wp-image-76716" src="http://martin-thoma.com/wp-content/uploads/2013/11/WER-calculation.png" alt="WER calculation" width="500" height="494" /></a> WER calculation[/caption]

    becomes

        {% caption align="aligncenter" width="500" caption="WER calculation" url="../images/2013/11/WER-calculation.png" alt="WER calculation" title="" height="494" class="size-full wp-image-76716" %}

    Every matched group is also printed to stdout as ``name:<TAB>value``.

    Args:
        content: The text of a post.

    Returns:
        The text with every recognised caption shortcode rewritten; text
        without such shortcodes is returned unchanged.
    """
    import re

    # Raw strings keep the regex escapes (\[, \s) away from Python's own
    # string-escape processing; the resulting pattern text is unchanged.
    pattern = re.compile(
        r'\[caption(.*?)align="(?P<align>.*?)"(.*?)(caption="(?P<caption>.*?)")?(.*?)\]'
        r'<a(.*?)href="(?P<url>.*?)"(?P<asonst>.*?)>'
        r'<img(.*?)'
        r'class="(?P<imgclass>.*?)"\s*'
        r'src="(?P<imgurl>http://martin-thoma.com/wp-content/uploads/(?P<innerurl>.*?))" '
        r'alt="(?P<alt>.*?)"\s*'
        r'(title="(?P<title>.*?)")?\s*'
        r'width="(?P<width>.*?)"\s*'
        r'height="(?P<height>.*?)"\s*'
        r'(?P<isonst>.*?)/>'
        r'</a>\s*'
        r'(?P<text>.*?)\[/caption\]'
    )

    # Report what was found; unmatched optional groups show up as None.
    for match in pattern.finditer(content):
        for key, value in match.groupdict().items():
            print("%s:\t%s" % (key, value))

    template = (
        r'{% caption align="\g<align>" width="\g<width>" '
        r'caption="\g<caption>\g<text>" url="../images/\g<innerurl>" '
        r'alt="\g<alt>" title="\g<title>" height="\g<height>" '
        r'class="\g<imgclass>" %}'
    )
    # The optional groups (caption, title) may not take part in a match;
    # since Python 3.5 the substitution expands them to "" on its own, so
    # no wrapper around the match object is required.
    return pattern.sub(template, content)

def pageCodeConversion(page):
    """Rewrite WordPress shortcodes in a post into their Jekyll equivalents.

    The front matter (the text between the first two ``---`` markers) is
    passed through untouched; only the body is converted:

    * ``[latex]`` / ``[/latex]`` become ``$``.
    * ``[python]``, ``[cpp]``, ``[text]``, ``[bash]``, ``[java]`` and
      ``[javascript]`` blocks become ``{% highlight LANG %}`` /
      ``{% endhighlight %}``.
    * ``[caption]`` shortcodes are handed to ``parseCaptions``.

    Args:
        page: The complete text of a post, front matter included.

    Returns:
        The converted text, front matter included.
    """
    yaml, content = getYaml(page)

    content = content.replace("[latex]", "$")
    content = content.replace("[/latex]", "$")

    # Syntax highlighter: also '[python collapse="true"]'
    # [c] is dangerous. Use it with caution!
    for language in ('python', 'cpp', 'text', 'bash', 'java', 'javascript'):
        content = content.replace("[" + language + "]",
                                  "{% highlight " + language + " %}")
        content = content.replace("[/" + language + "]", "{% endhighlight %}")

    content = parseCaptions(content)
    return "---" + yaml + "---" + content

def getYaml(content):
    """Split a post into its front matter and its body.

    The front matter is the text between the first and the second ``---``
    marker; whatever precedes the first marker is discarded.  Later
    ``---`` markers belong to the body and are preserved.

    Args:
        content: The complete text of a post.

    Returns:
        A ``(yaml, body)`` tuple.  ``body`` is ``""`` when the text contains
        only a single ``---`` marker.

    Raises:
        ValueError: If the text contains no ``---`` marker at all.
    """
    # Only the first two markers matter, so stop splitting after them
    # instead of cutting the body apart and gluing it back together.
    # The starred target needs Python 3.
    _, yaml, *rest = content.split("---", 2)
    return (yaml, "".join(rest))

def featuredImage(website, content, filename=None):
    """Add a ``featured_image`` entry to a post's front matter.

    The blog at ``website`` is crawled page by page (following the link in
    the ``div.alignleft`` navigation element) to build a mapping from post
    slug to the featured image path below ``uploads/``.  If the post does
    not yet have a ``featured_image`` line and its slug is in that mapping,
    the post file is rewritten with the new line appended to the front
    matter.  Progress is reported on stdout.

    Args:
        website: URL of the blog's first index page, with trailing slash.
        content: The complete text of the post.
        filename: Path of the post file.  The slug is taken from it by
            cutting a prefix as long as ``./_posts/2013-11-18-`` and the
            ``.markdown`` suffix.  When omitted, a module-level
            ``filename`` variable is used, which is what the original
            implementation implicitly relied on.

    Raises:
        ValueError: If no file name is available.
    """
    # urllib2 only exists on Python 2; this file needs Python 3.
    from urllib.request import urlopen
    from bs4 import BeautifulSoup

    if filename is None:
        filename = globals().get("filename")
    if filename is None:
        # Fail before crawling the site rather than after.
        raise ValueError("featuredImage needs the path of the post file "
                         "(pass filename=...)")

    # Post links are cut relative to the start URL; the pagination URL in
    # ``website`` grows longer on later pages and would cut off too much.
    base_url = website

    # Get all featured image urls connected to posts
    post2image = {}
    while website:
        with urlopen(website) as response:
            html = response.read()
        soup = BeautifulSoup(html)
        for entry in soup.find_all("div", "entry"):
            img = entry.find("img")
            link = entry.find("a", "readmore")
            if img is None or link is None:
                continue
            parts = img['src'].split("uploads/")
            if len(parts) < 2:
                # Image is not hosted below uploads/; nothing to map.
                continue
            # Strip the site prefix and the trailing slash from the link.
            post = link['href'][len(base_url):-1]
            post2image[post] = parts[1]

        # Follow the navigation link to the next index page, if any.
        nav = soup.find("div", "alignleft")
        website = nav.find("a") if nav is not None else None
        if website is not None:
            website = website['href']
        print(website)

    yaml, content = getYaml(content)
    hasFeaturedImage = any(
        ":" in line and line.startswith("featured_image")
        for line in yaml.split("\n")
    )

    if hasFeaturedImage:
        print("x info: %s has already a featured image" % filename)
        return

    mdfilename = filename[len("./_posts/2013-11-18-"):-len(".markdown")]
    if mdfilename not in post2image:
        print("w warning: %s might not have a featured image." % mdfilename)
        return

    print("# success: %s" % mdfilename)
    imgsrc = post2image[mdfilename]
    newsrc = ("---" + yaml + "featured_image: " + imgsrc + "\n"
              + "---" + content)
    with open(filename, 'w') as f:
        f.write(newsrc)

def changeYaml(content):
    """Reduce a post's front matter to the keys the Jekyll site uses.

    * ``layout``, ``title``, ``author``, ``date``, ``context`` and
      ``featured_image`` lines are kept.
    * ``status``, ``published``, ``author_login``, ``author_email``,
      ``author_url``, ``wordpress_id`` and ``wordpress_url`` lines are
      deleted.
    * ``tags`` and ``categories`` are kept together with their ``-`` items.
    * ``comments`` is deleted together with its ``-`` items.
    * Every other line is deleted and printed to stdout so it can be
      reviewed.

    Args:
        content: The complete text of a post.

    Returns:
        The post with the rewritten front matter; the body is unchanged.

    Raises:
        ValueError: If the text contains no ``---`` marker.
    """
    tmp, yaml, *contentArray = content.split("---")

    kept = ("layout:", "title:", "author:", "date:", "context:",
            "featured_image:")
    dropped = ("status:", "published:", "author_login:", "author_email:",
               "author_url:", "wordpress_id:", "wordpress_url:")

    # ``section`` names the list whose "-" items are currently being read.
    # A single state variable guarantees that starting ``tags:`` or
    # ``categories:`` ends a preceding ``comments:`` list; with two
    # independent flags those items were discarded as comments.
    section = None
    lines = ["---"]
    for line in yaml.split("\n"):
        if line.startswith(kept):
            lines.append(line)
            section = None
        elif line.startswith(dropped):
            # ignore those lines (that means: delete them!)
            section = None
        elif line.startswith("comments:"):
            section = "comments"
        elif line.startswith(("tags:", "categories:")):
            lines.append(line)
            section = "taxonomy"
        elif line.startswith("-") and section == "comments":
            pass
        elif line.startswith("-") and section == "taxonomy":
            lines.append(line)
        else:
            print(line)

    header = "".join(line + "\n" for line in lines)
    return header + "---" + "---".join(contentArray)

def forEveryPost(website, operation, development=True, directory="./_drafts/"):
    """Apply ``operation`` to every file in ``directory``.

    Files are processed in sorted order and each path is printed.

    Args:
        website: Unused by this function; kept so existing calls keep
            working.
        operation: A callable that receives a file's text and returns the
            new text.
        development: When true (the default) this is a dry run: the
            operation is executed but nothing is written and a separator
            line is printed instead.  When false each file is overwritten
            with the operation's result.
        directory: The directory whose files are processed.
    """
    import os

    for name in sorted(os.listdir(directory)):
        filename = os.path.join(directory, name)

        with open(filename) as source:
            content = source.read()
        newContent = operation(content)

        print(filename)
        if not development:
            with open(filename, 'w') as target:
                target.write(newContent)
        else:
            print("#" * 80)

if __name__ == "__main__":
    # Script entry point.  The comparison must use plain ASCII quotes;
    # typographic quotes are not valid Python string delimiters.

    # Disabled single-file command line interface, kept for reference.
    """
    from argparse import ArgumentParser
    parser = ArgumentParser()
    # Add more options if you like
    parser.add_argument("-f", "--file", dest="filename", required="True",
                      help="convert file FILE", metavar="FILE")

    args = parser.parse_args()

    pageCodeConversion(args.filename)
    """

    # improve things: convert every draft in place (development=False).
    forEveryPost("http://martin-thoma.com/", pageCodeConversion, False)