The Geek Ramblings

Blog by Aravind Pedapudi

Jan 19th, 2016

Getting back old posts from Wordpress

I had a previous version of this blog hosted on Wordpress, on a different server. When I moved away from that server, I also wanted to move away from the bulky Wordpress (and php :wink:). But I had backups of old posts in an XML generated by Wordpress, which (for sentimental reasons!) I want to restore.

I have seen sample Wordpress XML backups - they were very verbose, and all it needed was converting into JSON and a bit of pruning. The process seemed easy, I might not have the pictures - but just having the posts was enough - until I hit the most difficult part of it: getting the backups!

The backups, I realized later, were on my older laptop, which is on the verge of breaking down - the display and the HDD both work intermittently before shutting down for days together. I was lucky to find it well rested, and it responded well enough - I had read-only access to the HDD on an emergency console in Arch Linux. The system was slow and there were constant 'Buffer I/O Error's, which were silenced by an oooh-shut-up: echo 0 0 0 0 > /proc/sys/kernel/printk :grin: Mounted the external disk and got working on copying at a snail's pace. 3 hours of work later, along with the XML backup, I found a full backup of my old server, which I now remember creating with cPanel. Brilliant! I got the pictures too!

Now the next step is to use xml2js and output all the posts as a json, query on the required posts in mongo and output as .md files (after which they go back to mongo :arrows_clockwise:)

Script (for reference - please forgive the synchronous operations!)

// Converts the WordPress XML export into a newline-delimited JSON file:
// every <item> of the RSS channel becomes one JSON document on its own line.
// Everything here is intentionally synchronous (one-off migration script).
var fs = require('fs'),
    p = new (require('xml2js').Parser)({explicitArray: false, async: false}),
    xml = fs.readFileSync('./posts/thegeekramblings.wordpress.2014-05-13.xml'),
    parsed = {},
    parseError = null,
    out = fs.openSync('./posts/thegeekramblings.wordpress.2014-05-13.json', 'w'),
    items,
    i;

try {
    // With async: false the callback runs before parseString returns, so the
    // result can be captured in the outer variables.
    p.parseString(xml.toString(), function (e, o) {
        parseError = e;
        parsed = o;
    });

    if (parseError) {
        throw parseError;
    }

    items = parsed.rss.channel.item;

    // explicitArray: false yields a bare object (not an array) when the export
    // holds exactly one <item>, and nothing at all when it holds none.
    if (!Array.isArray(items)) {
        items = items ? [items] : [];
    }

    for (i = 0; i < items.length; i++) {
        // Signature is writeSync(fd, string, position, encoding): null means
        // "write at the current position".
        fs.writeSync(out, JSON.stringify(items[i]) + '\n', null, 'utf8');
    }
} finally {
    // Synchronous close: fs.close() requires a callback, and the descriptor
    // must be released even when parsing or writing fails.
    fs.closeSync(out);
}

And after importing the json into mongo - the query to get the final list of posts: (categories - youtube and github - contain posts generated by a plugin to sync playlists and commit histories)

// Mongo shell query listing the posts to restore: documents whose post type is
// 'post', whose status is 'publish', and whose category nicename is neither
// 'youtube' nor 'github'.
db.old_posts.find({
    'wp:post_type': 'post', 
    // 'youtube' and 'github' hold plugin-generated posts (synced playlists and
    // commit histories), so they are excluded.
    'category.$.nicename': {
        $nin: ['youtube', 'github']
    }, 
    'wp:status': 'publish'
})

Then processing over the documents to create Markdown files:

var mongo = require('mongoose').mongo, // Getting the MongoClient instance underneath Mongoose
    fs = require('fs');

// Exports the old WordPress posts stored in the `old_posts` collection as
// Markdown files: one file per published post, with a front-matter header,
// the title as an underlined heading, and the cleaned-up post body.
mongo.connect('mongodb://localhost:27017/blog', function (e, db) {
    if (e) {
        console.error('Could not connect to mongo:', e);
        return;
    }

    db.collection('old_posts').find({
        'wp:post_type': 'post',
        // 'youtube' and 'github' hold plugin-generated posts, not real ones.
        'category.$.nicename': {
            $nin: ['youtube', 'github']
        },
        'wp:status': 'publish'
    }).toArray(function (e, docs) {
        var lines, doc, i;

        if (e) {
            console.error('Could not fetch the old posts:', e);
            db.close();
            return;
        }

        // Index loop instead of for...in: docs is an array.
        for (i = 0; i < docs.length; i++) {
            doc = docs[i];

            lines = [
                // Front matter; each key appears exactly once.
                '---',
                'layout: post',
                'title: ' + doc.title,
                'tags: ' + getTags(doc).join(', '),
                'createdAt: ' + doc['wp:post_date'],
                '---',
                '',

                // Title as a heading, underlined with as many '-' as it has characters.
                doc.title,
                Array(doc.title.length + 1).join('-'),

                cleanData(doc['content:encoded'])
            ];

            writeFile({
                name: 'wp-archived-' + doc['wp:post_name'],
                content: lines.join('\n')
            });
        }

        // All documents are in memory and the file writes do not use the
        // connection, so it can be released now and the process can exit.
        db.close();
    });
});

/**
 * Asynchronously saves one generated post under ./posts/ as a Markdown file
 * and logs the outcome (including any error) once the write has finished.
 *
 * @param {{name: string, content: string}} doc - file name (without extension)
 *        and the text to store in it.
 */
function writeFile(doc) {
    var target = './posts/' + doc.name + '.md';

    var report = function (err) {
        console.log('Wrote file:', doc.name, 'Errors:', err);
    };

    fs.writeFile(target, doc.content, report);
}

/**
 * Builds the tag list for an archived post from its category entries.
 *
 * The list always starts with 'archived-from-wp-backup'. Each category's
 * `$.nicename` is appended, except for 'uncategorized'. `doc.category` may be
 * missing, a single category object, or an array of them; entries that carry
 * no `$.nicename` (e.g. a plain string) are skipped instead of throwing.
 *
 * @param {Object} doc - post document; only `doc.category` is read.
 * @returns {string[]} the tags, in category order.
 */
function getTags(doc) {
    var tags = ['archived-from-wp-backup'],
        categories = doc.category,
        nicename,
        i;

    if (categories === undefined || categories === null) {
        return tags;
    }

    // Normalise the single-category case so one loop handles both shapes.
    if (!Array.isArray(categories)) {
        categories = [categories];
    }

    // Index loop rather than for...in: categories is an array.
    for (i = 0; i < categories.length; i++) {
        nicename = categories[i] && categories[i].$ && categories[i].$.nicename;

        if (nicename && nicename !== 'uncategorized') {
            tags.push(nicename);
        }
    }

    return tags;
}

/**
 * Cleans up the HTML body of a WordPress post for use in a Markdown file:
 *  - points uploaded-image URLs at the local /static/wordpress-imgs folder,
 *  - drops [caption ...] / [/caption] shortcodes (keeping what they wrap),
 *  - turns <pre ...> / </pre> into fenced code block markers,
 *  - removes HTML comments, including ones spanning several lines.
 *
 * @param {string} data - raw post content; null/undefined is treated as empty.
 * @returns {string} the cleaned content.
 */
function cleanData(data) {
    var cleaned;

    if (data === undefined || data === null) {
        return '';
    }

    cleaned = String(data);

    cleaned = cleaned.replace(/http:\/\/blog\.arawind\.com\/wp\-content\/uploads/g, '/static/wordpress-imgs');
    cleaned = cleaned.replace(/\[caption .*?\]/g, '');
    cleaned = cleaned.replace(/\[\/caption\]/g, '');
    // \b keeps other tags that merely start with "pre" untouched; [^>]* also
    // covers attributes that wrap onto another line.
    cleaned = cleaned.replace(/<pre\b[^>]*>/g, '\n```\n');
    cleaned = cleaned.replace(/<\/pre>/g, '\n```\n');
    // [\s\S] instead of '.', which does not match newlines.
    cleaned = cleaned.replace(/<!--[\s\S]*?-->/g, '');

    return cleaned;
}

Et voilà! A pretty neat job I must say! :smile: