Skip to content

Commit

Permalink
Merge pull request #263 from axa-group/feature/better-error-trace
Browse files Browse the repository at this point in the history
Feature/better error trace
  • Loading branch information
jvalls-axa authored Dec 16, 2019
2 parents e79073d + 0f1ed1c commit c8d4305
Show file tree
Hide file tree
Showing 25 changed files with 368 additions and 644 deletions.
61 changes: 61 additions & 0 deletions api/server/src/api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import * as fs from 'fs';
import multer from 'multer';
import * as os from 'os';
import * as path from 'path';
import dependencies from './dependencies.json';
import { FileManager } from './FileManager';
import logger from './Logger';
import { ProcessManager } from './ProcessManager';
Expand Down Expand Up @@ -94,11 +95,71 @@ export class ApiServer {
v1_0.get('/modules', this.handleGetModules.bind(this));
v1_0.get('/module-config/:modulename', this.handleGetModuleConfig.bind(this));

v1_0.get('/check-installation', this.handleCheckInstallation.bind(this));

app.listen(port, () => {
logger.info(`Api listening on port ${port}!`);
});
}

/**
 * Status: 200 - Ok. Returns an HTML table reporting, for every dependency group
 * listed in dependencies.json, whether one of its alternative commands is
 * available on the PATH of the machine running the API.
 *
 * Each group is a list of alternative command names; the first alternative that
 * is found is reported, otherwise the first name of the group is reported as
 * missing.
 *
 * @param req The incoming request (unused).
 * @param res The response the HTML report is written to.
 */
private handleCheckInstallation(req: Request, res: Response): void {
  const response = `
<style>
table,
th,
td {
text-align: left;
border: 1px solid black;
}
.found {
background: lightgreen;
}
.not.found {
background: red;
}
</style>
<table>
<tr>
<th>Dependency name</th>
<th>Found?</th>
<th>Required?</th>
<th>Path</th>
</tr>
`;
  const whereIs = os.platform() === 'win32' ? 'where' : 'which';
  const groups: string[][] = dependencies.required.concat(dependencies.optional);

  const result = groups.map(group => {
    // `concat` keeps the original array references, so identity lookup is enough here.
    const required = dependencies.required.includes(group);

    const candidates = group.map(name => {
      // Entries such as "magick convert" are "<executable> <sub-command>": only the
      // executable can be located on the PATH, the whole string never matches a file.
      const [executable = name] = name.split(' ');
      const { status, stdout } = spawnSync(whereIs, [executable]);
      const found = status === 0;
      return {
        name,
        found,
        // which/where terminate their output with a newline; strip it for display.
        path: found ? stdout.toString().trim() : '',
        required,
      };
    });

    return (
      candidates.find(candidate => candidate.found) || {
        name: group[0],
        found: false,
        path: '',
        required,
      }
    );
  });

  res.type('html').send(
    response.concat(
      result.map(r =>
        `<tr>
<td>${r.name}</td>
<td class="${r.found ? 'found' : 'not found'}">${r.found ? 'YES' : 'NO'}</td>
<td>${r.required ? 'YES' : 'NO'}</td>
<td>${r.path || '-'}</td>
</tr>`,
      ).join(''),
      '</table>',
    ),
  ).end();
}

/**
* Status: 200 - Ok. Returns the default config of the server
* Status: 404 - Not Found - the default server config was not found in the pre-set location
Expand Down
13 changes: 13 additions & 0 deletions api/server/src/dependencies.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"required": [
["magick convert", "convert"],
["magick identify", "identify"],
["dumppdf.py", "dumppdf"],
["pdf2txt.py", "pdf2txt"],
["python3", "python"],
["qpdf"],
["camelot"],
["tesseract"]
],
"optional": [["mutool"], ["pandoc"]]
}
2 changes: 1 addition & 1 deletion server/src/Cleaner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
*/

import { HeaderFooterDetectionModule } from './processing/HeaderFooterDetectionModule/HeaderFooterDetectionModule';
import { HeadingDetectionDTModule } from './processing/HeadingDetectionDTModule/HeadingDetectionDTModule';
import { HeadingDetectionDTModule } from './processing/HeadingDetectionDtModule/HeadingDetectionDtModule';
import { HeadingDetectionModule } from './processing/HeadingDetectionModule/HeadingDetectionModule';
import { HierarchyDetectionModule } from './processing/HierarchyDetectionModule/HierarchyDetectionModule';
import { KeyValueDetectionModule } from './processing/KeyValueDetectionModule/KeyValueDetectionModule';
Expand Down
2 changes: 1 addition & 1 deletion server/src/input/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ Each module may or may not contain a set of configurable parameters, which (alon

## The Modules

1. [Pdf2Json](pdf2json/README.md)
1. [PDF.js](pdf.js/README.md)
2. [Pdfminer](pdfminer/README.md)
3. [Tesseract](tesseract/README.md)
4. [Google Vision](google-vision/README.md)
Expand Down
60 changes: 0 additions & 60 deletions server/src/input/extract-fonts.ts

This file was deleted.

43 changes: 20 additions & 23 deletions server/src/input/extractImagesFonts.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,30 +25,27 @@ import logger from '../utils/Logger';
*/
export function extractImagesAndFonts(pdfInputFile: string): Promise<void> {
  return new Promise<void>((resolve, reject) => {
    const folder = utils.getMutoolExtractionFolder();
    logger.info(`Extracting images and fonts to ${folder}`);
    utils.CommandExecuter.run(utils.CommandExecuter.COMMANDS.MUTOOL, ['extract', pdfInputFile], { cwd: folder })
      .then(
        () => {
          // mutool names extracted fonts "<6-letter subset tag>+<font name>-<object id>.ttf";
          // rename them to "<font name>.ttf".
          const ttfRegExp = /^[A-Z]{6}\+(.*)\-[0-9]+\.ttf$/;
          try {
            fs.readdirSync(folder).forEach(file => {
              const match = file.match(ttfRegExp);
              if (match) {
                fs.renameSync(`${folder}/${file}`, `${folder}/${match[1]}` + '.ttf');
              }
            });
            resolve();
          } catch (renameError) {
            // A file-system failure is a real error: it must not be mistaken for
            // "mutool is not installed" by the command-failure handler below.
            reject(renameError);
          }
        },
        // Only failures of the mutool command itself reach this handler.
        // NOTE(review): assumes CommandExecuter.run rejects with { found, error } — confirm.
        ({ found, error }) => {
          logger.warn(error);
          if (!found) {
            // Missing mutool is tolerated: the document is processed without images/fonts.
            logger.warn('MuPDF not installed. Will not treats images inside documents...');
            resolve();
          } else {
            reject(error);
          }
        },
      );
  });
}
1 change: 1 addition & 0 deletions server/src/input/pdf.js/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# PDF.js Input Module
52 changes: 1 addition & 51 deletions server/src/input/pdf.js/pdfjs.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
* limitations under the License.
*/

import * as fs from 'fs';
import * as limit from 'limit-async';
import * as pdfjs from 'pdfjs-dist';
import {
Expand Down Expand Up @@ -50,7 +49,7 @@ export function execute(pdfInputFile: string): Promise<Document> {
const startTime: number = Date.now();

return new Promise<Document>((resolveDocument, rejectDocument) => {
return repairPdf(pdfInputFile).then((repairedPdf: string) => {
return utils.repairPdf(pdfInputFile).then((repairedPdf: string) => {
const pages: Array<Promise<Page>> = [];
try {
return (pdfjs.getDocument(repairedPdf) as any).promise.then(doc => {
Expand Down Expand Up @@ -154,52 +153,3 @@ async function loadPage(document: any, pageNum: number): Promise<Page> {
new BoundingBox(0, 0, viewport.width, viewport.height),
);
}

/**
 * Repair a pdf using the external qpdf and mutool utilities.
 * Use qpdf to decrypt the pdf to avoid errors due to DRMs.
 *
 * Both steps are best-effort: when a tool is missing or fails, the result of the
 * previous step (ultimately the original file) is used instead.
 *
 * @param filePath The absolute filename and path of the pdf file to be repaired.
 * @returns A promise resolving to the path of the most repaired file available:
 * the mutool-cleaned file, else the qpdf-decrypted file, else `filePath` itself.
 */
function repairPdf(filePath: string): Promise<string> {
  const qpdfPath = utils.getCommandLocationOnSystem('qpdf');
  let qpdfOutputFile = utils.getTemporaryFile('.pdf');
  if (qpdfPath) {
    const qpdfResult = utils.spawnSync('qpdf', ['--decrypt', filePath, qpdfOutputFile]);

    if (qpdfResult.status === 0) {
      logger.info(
        `qpdf repair successfully performed on file ${filePath}. New file at: ${qpdfOutputFile}`,
      );
    } else {
      logger.warn(
        // Template literal (not a quoted string) so the file path is actually interpolated.
        `qpdf error for file ${filePath}:`,
        qpdfResult.status,
        qpdfResult.stdout.toString(),
        qpdfResult.stderr.toString(),
      );
      qpdfOutputFile = filePath;
    }
  } else {
    logger.warn(`qpdf not found on the system. Not repairing the PDF...`);
    qpdfOutputFile = filePath;
  }

  return new Promise<string>(resolve => {
    const mutoolPath = utils.getCommandLocationOnSystem('mutool');
    if (!mutoolPath) {
      logger.warn('MuPDF not installed !! Skip clean PDF.');
      resolve(qpdfOutputFile);
      return;
    }

    const mupdfOutputFile = utils.getTemporaryFile('.pdf');
    // NOTE(review): assumes utils.spawn returns a Node ChildProcess whose 'close'
    // event carries the exit code — confirm against utils.
    const pdfFixer = utils.spawn('mutool', ['clean', qpdfOutputFile, mupdfOutputFile]);
    pdfFixer.on('close', (code: number | null) => {
      if (code !== 0) {
        logger.warn(
          `mupdf cleaning failed on file ${qpdfOutputFile} (exit code ${code}). Using ${qpdfOutputFile} instead.`,
        );
        resolve(qpdfOutputFile);
        return;
      }

      try {
        // Check that the cleaned file is correctly written on the file system,
        // and release the descriptor afterwards.
        const fd = fs.openSync(mupdfOutputFile, 'r+');
        try {
          fs.fsyncSync(fd);
        } finally {
          fs.closeSync(fd);
        }
      } catch (syncError) {
        logger.warn(
          `mupdf output file ${mupdfOutputFile} is not usable. Using ${qpdfOutputFile} instead.`,
          syncError,
        );
        resolve(qpdfOutputFile);
        return;
      }

      logger.info(
        `mupdf cleaning successfully performed on file ${qpdfOutputFile}. Resulting file: ${mupdfOutputFile}`,
      );
      resolve(mupdfOutputFile);
    });
  });
}
Loading

0 comments on commit c8d4305

Please sign in to comment.