% Zenodo record for the Fusus workflow software (DOI 10.5281/zenodo.4309884).
% The abstract appears to be a Markdown README that was flattened onto one line by an
% export step; inter-sentence spacing is restored and the Markdown heading markers are
% written as plain-text labels so the field contains no raw LaTeX parameter characters.
@misc{be84af381f28451baf7493d0f5a206f1,
  title     = {{Fusus}: a workflow to transform {Arabic} classical works in printed form to structured text},
  abstract  = {Fusus. This is a workflow that transforms scanned pages into readable text.
               The pages come from several printed Arabic books from the past few centuries.
               The workflow takes care of cleaning, OCR and postprocessing.
               A user can copy and paste image fragments of specks and symbols that must be removed before doing OCR.
               The workflow detects column layout and line boundaries.
               Individual lines will be passed to the OCR engine, which is Kraken using a model trained
               on many printed Arabic books.
               See [model](https://among.github.io/fusus/about/model.html).
               The result is stored in tab-separated files, with the transcription computed by the OCR step,
               plus position and confidence info resulting from that same step.
               The workflow can generate proofing pages that support manually checking the OCR results.
               Next steps: Once we have scanned a significant amount of pages, we'll construct a dataset in
               Text-Fabric format out of it, with features that preserve positions of the words on the page
               and their confidence.
               From there we can implement steps to correct OCR mistakes and to perform intertextuality research
               between the ground work (the "Fusus" by Ibn Arabi) and its commentary books.
               Authors: This is work done by Cornelis van Lit and Dirk Roorda.
               There is more documentation about sources, the research project, and how to use
               this software in the [docs](https://among.github.io/fusus/).},
  keywords  = {arabic, ocr, workflow, text-processing, image-processing, python, kraken, opencv, text-fabric, digital humanities, wisdom},
  author    = {Roorda, Dirk and van Lit, Cornelis},
  year      = {2020},
  month     = dec,
  day       = {7},
  doi       = {10.5281/zenodo.4309884},
  language  = {English},
  publisher = {Zenodo},
  address   = {Switzerland},
}