Getting back old posts from Wordpress
I had a previous version of this blog hosted on Wordpress, on a different server. When I moved away from that server, I also wanted to move away from the bulky Wordpress (and PHP). But I had backups of old posts in an XML file generated by Wordpress, which (for sentimental reasons!) I wanted to restore.
I have seen sample Wordpress XML backups - they were very verbose, and all it needed was converting into JSON and a bit of pruning. The process seemed easy, I might not have the pictures - but just having the posts was enough - until I hit the most difficult part of it: getting the backups!
The backups, I realized later, were on my older laptop, which is on the verge of breaking down - the display and the HDD both work intermittently before shutting down for days together. I was lucky to find it well rested, and it responded well enough - I had read-only access to the HDD on an emergency console in Arch Linux. The system was slow and there were constant 'Buffer I/O Error's, which were silenced by an oooh-shut-up: echo 0 0 0 0 > /proc/sys/kernel/printk
Mounted the external disk and got working on copying at a snail's pace. After 3 hours of work, along with the XML backup, I found a full backup of my old server, which I now remember creating through cPanel. Brilliant! I got the pictures too!
Now the next step is to use xml2js to output all the posts as JSON, query for the required posts in mongo and output them as .md files (after which they go back into mongo).
Script (for reference - please forgive the synchronous operations!)
// One-off conversion script: parse the WordPress XML export and write every
// <item> as one JSON document per line, ready to be imported into mongo.
var fs = require('fs'),
    p = new (require('xml2js').Parser)({explicitArray: false, async: false}),
    xml = fs.readFileSync('./posts/thegeekramblings.wordpress.2014-05-13.xml'),
    parsed = {},
    parseError = null,
    items,
    out,
    i;

// The script relies on the callback having run by the time parseString
// returns (async: false), so the outcome is captured and checked right after.
p.parseString(xml.toString(), function (e, o) {
    parseError = e;
    parsed = o;
});
if (parseError) {
    // Fail loudly instead of crashing later on an undefined `parsed`.
    throw parseError;
}

// With explicitArray: false a single <item> is returned as a plain object and
// a missing one as undefined; normalise both so the loop always sees an array.
items = parsed.rss.channel.item;
if (items === undefined || items === null) {
    items = [];
} else if (!Array.isArray(items)) {
    items = [items];
}

// Open the output only once there is something valid to write, and always
// release the descriptor, even if a write throws.
out = fs.openSync('./posts/thegeekramblings.wordpress.2014-05-13.json', 'w');
try {
    for (i = 0; i < items.length; i++) {
        // Signature is writeSync(fd, string, position, encoding): null keeps
        // writing at the current position.
        fs.writeSync(out, JSON.stringify(items[i]) + '\n', null, 'utf8');
    }
} finally {
    fs.closeSync(out);
}
And after importing the json into mongo - the query to get the final list of posts: (categories - youtube and github - contain posts generated by a plugin to sync playlists and commit histories)
// Select the published blog posts, leaving out the plugin-generated
// 'youtube' and 'github' categories. The '$' path segment is a literal key in
// the imported documents (category attributes live under it).
db.old_posts.find({
    'wp:status': 'publish',
    'wp:post_type': 'post',
    'category.$.nicename': {$nin: ['youtube', 'github']}
})
Then processing over the documents to create Markdown files:
// Export script: read the selected posts from the `old_posts` collection and
// hand one Markdown document (front matter + heading + body) per post to
// writeFile().
var mongo = require('mongoose').mongo, // Getting the MongoClient instance underneath Mongoose
    fs = require('fs');

mongo.connect('mongodb://localhost:27017/blog', function (e, db) {
    if (e) {
        console.log('Could not connect to mongo:', e);
        return;
    }

    db.collection('old_posts').find({
        'wp:post_type': 'post',
        'category.$.nicename': {
            $nin: ['youtube', 'github']
        },
        'wp:status': 'publish'
    }).toArray(function (err, docs) {
        var doc, lines, i;

        // Everything needed is in memory (or failed) at this point; close the
        // connection so the process can exit once the files are written.
        db.close();

        if (err) {
            console.log('Could not fetch old posts:', err);
            return;
        }

        // Plain index loop: `docs` is an array, so for...in is avoided.
        for (i = 0; i < docs.length; i++) {
            doc = docs[i];
            lines = [
                // Front matter; `layout` is emitted once (it used to be duplicated).
                '---',
                'layout: post',
                'title: ' + doc.title,
                'tags: ' + getTags(doc).join(', '),
                'createdAt: ' + doc['wp:post_date'],
                '---',
                '',
                // Title as a setext heading, underlined with one dash per character.
                doc.title,
                Array(doc.title.length + 1).join('-'),
                cleanData(doc['content:encoded'])
            ];
            writeFile({
                name: 'wp-archived-' + doc['wp:post_name'],
                content: lines.join('\n')
            });
        }
    });
});
/**
 * Write one generated post to ./posts/<name>.md and log the outcome.
 *
 * @param {{name: string, content: string}} doc - File base name (without
 *     extension) and the full Markdown text to store.
 */
function writeFile(doc) {
    var target = './posts/' + doc.name + '.md';

    // Logged for every file; `err` is whatever fs.writeFile passes back.
    function report(err) {
        console.log('Wrote file:', doc.name, 'Errors:', err);
    }

    fs.writeFile(target, doc.content, report);
}
/**
 * Build the tag list for a post from its `category` field.
 *
 * `doc.category` may be missing, a single category object, or an array of
 * them; each category carries its attributes under the `$` key. Every
 * `nicename` other than 'uncategorized' becomes a tag, after the fixed
 * 'archived-from-wp-backup' marker. Entries without a `$.nicename` (for
 * example a category that is just a string) are skipped instead of throwing.
 *
 * @param {Object} doc - Post document.
 * @returns {string[]} Tags, always starting with 'archived-from-wp-backup'.
 */
function getTags(doc) {
    var tags = ['archived-from-wp-backup'],
        categories = doc.category,
        entry,
        nicename,
        i;

    if (categories === undefined || categories === null) {
        return tags;
    }
    // Treat a lone category the same way as a one-element list.
    if (!Array.isArray(categories)) {
        categories = [categories];
    }

    for (i = 0; i < categories.length; i++) {
        entry = categories[i];
        nicename = entry && entry.$ && entry.$.nicename;
        if (nicename && nicename !== 'uncategorized') {
            tags.push(nicename);
        }
    }
    return tags;
}
/**
 * Turn a post's WordPress HTML body into Markdown-friendlier text.
 *
 * - rewrites the old blog's upload URLs to /static/wordpress-imgs
 * - strips [caption ...] / [/caption] shortcodes, keeping their content
 * - replaces <pre ...> and </pre> with fenced-code markers
 * - removes HTML comments
 *
 * Tags, shortcodes and comments that span several lines are handled too.
 *
 * @param {string} data - Raw post content; null/undefined yields ''.
 * @returns {string} Cleaned content.
 */
function cleanData(data) {
    if (data === undefined || data === null) {
        return '';
    }
    return String(data)
        .replace(/http:\/\/blog\.arawind\.com\/wp-content\/uploads/g, '/static/wordpress-imgs')
        // [^\]] instead of . so attributes broken across lines still match.
        .replace(/\[caption [^\]]*\]/g, '')
        .replace(/\[\/caption\]/g, '')
        // \b keeps tags that merely start with "pre" (e.g. <prefix>) intact.
        .replace(/<pre\b[^>]*>/g, '\n```\n')
        .replace(/<\/pre>/g, '\n```\n')
        // [\s\S] matches newlines, so multi-line comments are removed as well.
        .replace(/<!--[\s\S]*?-->/g, '');
}
Et voilà! A pretty neat job, I must say!