Detect Metadata About PDF From Javascript
If my JavaScript code is running in a Chrome extension, and the browser has loaded a PDF file, can I detect metadata about the loaded PDF (number of pages, etc.)? Extra challenge:
Solution 1:
After some quick Google-fu I learned that PDFs store metadata in XMP format (XML), so you can read the raw file data and pick out the metadata with some simple regex.
Select a PDF file from your computer to see the demo:
// Demo wiring: when the user picks a file in the #f input, read its PDF
// details and print them to the console.
document.getElementById('f').oninput = async function() {
  const pdf = this.files[0];
  // No file in the selection (e.g. it was cleared): there is nothing to read.
  if (!pdf) {
    return;
  }
  try {
    const details = await pdfDetails(pdf);
    console.log(details);
  } catch (err) {
    // Report the failure instead of leaving an unhandled promise rejection.
    console.error(err);
  }
};
/**
 * Reads a PDF blob as a binary string and extracts simple details from the
 * raw bytes using regular expressions.
 *
 * @param {Blob} pdfBlob - The PDF file to inspect.
 * @returns {Promise<Array<Object>>} Resolves with an array whose first entry
 *   is `{ Pages: <count> }`, followed by one single-key object per
 *   `<xmp…:Name>value<` tag found, in document order. Rejects if the file
 *   cannot be read or the result cannot be scanned.
 */
function pdfDetails(pdfBlob) {
  return new Promise((done, fail) => {
    const reader = new FileReader();
    reader.onload = function() {
      try {
        const raw = reader.result;
        // match() yields null (not an empty array) when nothing matches, so
        // fall back to [] to report 0 pages instead of throwing.
        // [^s] excludes "/Type /Pages" (the page-tree node) from the count.
        const Pages = (raw.match(/\/Type[\s]*\/Page[^s]/g) || []).length;
        const meta = [{
          Pages
        }];
        const xmpTag = /<xmp.*?:(.*?)>(.*?)</g;
        let hit = xmpTag.exec(raw);
        while (hit !== null) {
          // hit[0] is the whole match; the two groups are tag name and text.
          const [, key, value] = hit;
          meta.push({
            [key]: value
          });
          hit = xmpTag.exec(raw);
        }
        done(meta);
      } catch (err) {
        // Surface scanning failures to the caller rather than leaving the
        // promise pending forever.
        fail(err);
      }
    };
    // Without this, a failed read would never settle the promise.
    reader.onerror = function() {
      fail(reader.error);
    };
    reader.readAsBinaryString(pdfBlob);
  });
}
<input type=file id=f accept=".pdf">
Solution 2:
A PDF document doesn't list the number of pages in its metadata. Even if you added some custom metadata to track that information, it wouldn't be stored in a standard way that PDF readers would be expected to understand.
Post a Comment for "Detect Metadata About PDF From Javascript"