Skip to content

Commit

Permalink
feat: add OCR for image descriptions using tesseract.js (#1433)
Browse files Browse the repository at this point in the history
* feat: add OCR for image descriptions using tesseract.js

* tweak style of alt editor

* remove unnecessary files
  • Loading branch information
nolanlawson committed Aug 26, 2019
1 parent bd2a7ab commit ca9a32d
Show file tree
Hide file tree
Showing 12 changed files with 272 additions and 12 deletions.
3 changes: 2 additions & 1 deletion bin/svgs.js
Expand Up @@ -53,5 +53,6 @@ module.exports = [
{ id: 'fa-refresh', src: 'src/thirdparty/font-awesome-svg-png/white/svg/refresh.svg' },
{ id: 'fa-plus', src: 'src/thirdparty/font-awesome-svg-png/white/svg/plus.svg' },
{ id: 'fa-info-circle', src: 'src/thirdparty/font-awesome-svg-png/white/svg/info-circle.svg' },
{ id: 'fa-crosshairs', src: 'src/thirdparty/font-awesome-svg-png/white/svg/crosshairs.svg' }
{ id: 'fa-crosshairs', src: 'src/thirdparty/font-awesome-svg-png/white/svg/crosshairs.svg' },
{ id: 'fa-magic', src: 'src/thirdparty/font-awesome-svg-png/white/svg/magic.svg' }
]
3 changes: 3 additions & 0 deletions package.json
Expand Up @@ -66,6 +66,7 @@
"express": "^4.17.1",
"file-api": "^0.10.4",
"file-drop-element": "0.2.0",
"file-loader": "^4.2.0",
"form-data": "^2.5.0",
"glob": "^7.1.4",
"indexeddb-getall-shim": "^1.3.5",
Expand Down Expand Up @@ -97,6 +98,8 @@
"svelte-transitions": "^1.2.0",
"svgo": "^1.3.0",
"terser-webpack-plugin": "^1.4.1",
"tesseract.js": "^2.0.0-alpha.13",
"tesseract.js-core": "^2.0.0-beta.10",
"text-encoding": "^0.7.0",
"tiny-queue": "^0.2.1",
"webpack": "^4.39.2",
Expand Down
72 changes: 68 additions & 4 deletions src/routes/_components/dialog/components/MediaAltEditor.html
Expand Up @@ -2,7 +2,7 @@
<textarea
id="the-media-alt-input-{realm}-{index}"
class="media-alt-input"
placeholder="Description"
placeholder="Describe for the visually impaired"
ref:textarea
bind:value=rawText
></textarea>
Expand All @@ -20,6 +20,21 @@
max={mediaAltCharLimit}
style="width: 100%; text-align: right;"
/>
<button class="extract-text-button" type="button"
on:click="onClick()"
disabled={extracting}
>
<SvgIcon href="{extracting ? '#fa-spinner' : '#fa-magic'}"
className="extract-text-svg {extracting ? 'spin' : ''}"
/>
<span>
{#if extracting}
Extracting text…
{:else}
Extract text from image
{/if}
</span>
</button>
</div>
<style>
.media-alt-editor {
Expand All @@ -38,12 +53,35 @@
max-height: 70vh;
}

.extract-text-button {
display: flex;
justify-content: center;
align-items: center;
margin-top: 10px;
}

.extract-text-button span {
margin-left: 15px;
}

:global(.extract-text-svg) {
fill: var(--button-text);
width: 18px;
height: 18px;
}

@media (max-height: 767px) {
.media-alt-input {
max-height: 40vh;
width: 100%;
}
}

@media (min-height: 768px) {
.media-alt-input {
min-width: 250px;
}
}
</style>
<script>
import { requestPostAnimationFrame } from '../../../_utils/requestPostAnimationFrame'
Expand All @@ -57,6 +95,9 @@
import LengthGauge from '../../LengthGauge.html'
import LengthIndicator from '../../LengthIndicator.html'
import { length } from 'stringz'
import { runTesseract } from '../../../_utils/runTesseract'
import SvgIcon from '../../SvgIcon.html'
import { toast } from '../../toast/toast'

const updateRawTextInStore = throttleTimer(requestPostAnimationFrame)

Expand All @@ -72,11 +113,13 @@
store: () => store,
data: () => ({
rawText: '',
mediaAltCharLimit: MEDIA_ALT_CHAR_LIMIT
mediaAltCharLimit: MEDIA_ALT_CHAR_LIMIT,
extracting: false
}),
computed: {
length: ({ rawText }) => length(rawText || ''),
overLimit: ({ mediaAltCharLimit, length }) => length > mediaAltCharLimit
overLimit: ({ mediaAltCharLimit, length }) => length > mediaAltCharLimit,
url: ({ media, index }) => get(media, [index, 'data', 'url'])
},
methods: {
observe,
Expand Down Expand Up @@ -118,11 +161,32 @@
},
measure () {
autosize.update(this.refs.textarea)
},
async onClick () {
this.set({ extracting: true })
try {
const { url } = this.get()
const text = await runTesseract(url)
const { media, index, realm } = this.get()
if (media[index].description !== text) {
media[index].description = text
this.store.setComposeData(realm, { media })
this.store.save()
}
} catch (err) {
console.error(err)
/* no await */ toast.say(
'Unable to extract text. Ensure your instance supports cross-origin resource sharing (CORS) for images.'
)
} finally {
this.set({ extracting: false })
}
}
},
components: {
LengthGauge,
LengthIndicator
LengthIndicator,
SvgIcon
}
}
</script>
4 changes: 4 additions & 0 deletions src/routes/_utils/asyncModules.js
Expand Up @@ -47,3 +47,7 @@ export const importSnackbar = () => import(
export const importComposeBox = () => import(
/* webpackChunkName: 'ComposeBox.html' */ '../_components/compose/ComposeBox.html'
).then(getDefault)

// Dynamically imports the Tesseract worker module on first use. The webpackChunkName
// comment names the resulting chunk 'tesseractWorker'; the loaded module is passed
// through getDefault before being handed to the caller.
export const importTesseractWorker = () => import(
/* webpackChunkName: 'tesseractWorker' */ '../_utils/tesseractWorker.js'
).then(getDefault)
19 changes: 19 additions & 0 deletions src/routes/_utils/runTesseract.js
@@ -0,0 +1,19 @@
import { importTesseractWorker } from '../_utils/asyncModules'

/**
 * Run OCR on an image and resolve with the recognized text.
 *
 * @param image - passed straight through to the worker's `recognize()`
 * @returns {Promise<string>} the `text` field of the recognition result
 */
export async function runTesseract (image) {
  const tesseractWorker = await importTesseractWorker()
  const savedBlob = window.Blob
  let job

  // TODO: have to trick tesseract into not creating a blob URL because that would break our CSP
  // see https://github.com/naptha/tesseract.js/pull/322
  // window.Blob is hidden only while recognize() is being called, and is always restored.
  window.Blob = null
  try {
    job = tesseractWorker.recognize(image)
  } finally {
    window.Blob = savedBlob
  }

  job.progress(status => console.log('progress', status))
  const { text } = await job
  return text
}
21 changes: 21 additions & 0 deletions src/routes/_utils/tesseractWorker.js
@@ -0,0 +1,21 @@
import workerPath from 'tesseract.js/dist/worker.min.js'
// TODO: we should use .wasm instead of .wasm.js. But currently can't because:
// 1. not supported https://github.com/naptha/tesseract.js/blob/9f1e782/docs/local-installation.md#corepath
// 2. webpack defaultRules issues (fixable with https://github.com/webpack/webpack/issues/8412#issuecomment-445586591)

// We should explore this at a later date.
import corePath from 'tesseract.js-core/tesseract-core.wasm.js'
import { TesseractWorker } from 'tesseract.js'

// One TesseractWorker instance is created here and shared by every OCR request.
// Known tesseract bug: a broken image URL makes it fail silently. Spawning a fresh worker
// per request would work around that, but it literally starts a new web worker each time,
// which seems excessive, so we live with the bug for now.
// https://github.com/naptha/tesseract.js/issues/325
const baseUrl = `${location.origin}/`

// Every asset is addressed with an absolute URL on the page's own origin.
const workerOptions = {
  workerPath: `${baseUrl}${workerPath}`,
  langPath: baseUrl,
  corePath: `${baseUrl}${corePath}`
}

export default new TesseractWorker(workerOptions)
13 changes: 13 additions & 0 deletions src/service-worker.js
Expand Up @@ -18,11 +18,13 @@ const assets = __assets__
.map(file => file.startsWith('/') ? file : `/${file}`)
.filter(filename => !filename.endsWith('.map'))
.filter(filename => filename !== '/robots.txt')
.filter(filename => !filename.includes('traineddata.gz')) // Tesseract already caches it in IDB

// `shell` is the list of every file generated by webpack
// (it also contains '/index.html' for some reason).
// Sourcemaps are not worth precaching, and the Tesseract wasm core is cached on-demand.
const webpackAssets = __shell__.filter(filename => {
  if (filename.endsWith('.map')) {
    return false // don't bother with sourcemaps
  }
  return !filename.includes('tesseract-core.wasm') // cache on-demand
})

// `routes` is an array of `{ pattern: RegExp }` objects that
// match the pages in your src
Expand Down Expand Up @@ -93,6 +95,17 @@ self.addEventListener('fetch', event => {
if (response) {
return response
}

if (/tesseract-core\.wasm/.test(url.pathname)) {
// cache this on-demand
const response = await fetch(req)
if (response && response.status >= 200 && response.status < 300) {
const clonedResponse = response.clone()
/* no await */ caches.open(WEBPACK_ASSETS).then(cache => cache.put(req, clonedResponse))
}
return response
}

// for routes, serve the /service-worker-index.html file from the most recent
// static cache
if (routes.find(route => route.pattern.test(url.pathname))) {
Expand Down
Binary file added static/eng.traineddata.gz
Binary file not shown.
24 changes: 22 additions & 2 deletions webpack/client.config.js
Expand Up @@ -38,7 +38,24 @@ module.exports = {
{
test: /\/_workers\/blurhash\.js$/,
use: {
loader: 'worker-loader'
loader: 'worker-loader',
options: {
name: 'blurhash.[hash].[name].[ext]'
}
}
},
{
test: [
/tesseract\.js\/dist\/worker\.min\.js$/,
/tesseract\.js\/dist\/worker\.min\.js.map$/,
/tesseract\.js-core\/tesseract-core\.wasm$/,
/tesseract\.js-core\/tesseract-core\.wasm.js$/
],
use: {
loader: 'file-loader',
options: {
name: 'tesseract-asset.[hash].[name].[ext]'
}
}
},
{
Expand Down Expand Up @@ -109,6 +126,9 @@ module.exports = {
]),
devtool: dev ? 'inline-source-map' : 'source-map',
performance: {
hints: dev ? false : 'error' // fail if we exceed the default performance budgets
hints: dev ? false : 'error',
assetFilter: assetFilename => {
return !(/\.map$/.test(assetFilename)) && !/tesseract-asset/.test(assetFilename)
}
}
}
17 changes: 15 additions & 2 deletions webpack/server.config.js
Expand Up @@ -3,9 +3,22 @@ const config = require('sapper/config/webpack.js')
const pkg = require('../package.json')
const { mode, dev, resolve, inlineSvgs } = require('./shared.config')

// modules that the server should ignore, either because they cause errors or warnings
// (because they're only used on the client side)
// Each entry is aliased to 'lodash-es/noop' in the server build's resolve config below.
const NOOP_MODULES = [
'page-lifecycle/dist/lifecycle.mjs', // page lifecycle fails in Node
'../_workers/blurhash', // not used on the server side
// Tesseract OCR worker script, its sourcemap, the wasm core and the library itself
// (presumably all client-only, since OCR is triggered from the browser UI — confirm)
'tesseract.js/dist/worker.min.js',
'tesseract.js/dist/worker.min.js.map',
'tesseract.js-core/tesseract-core.wasm',
'tesseract.js-core/tesseract-core.wasm.js',
'tesseract.js'
]

const serverResolve = JSON.parse(JSON.stringify(resolve))
serverResolve.alias['page-lifecycle/dist/lifecycle.mjs'] = 'lodash-es/noop' // page lifecycle fails in Node
serverResolve.alias['../_workers/blurhash'] = 'lodash-es/noop' // not used on the server side
NOOP_MODULES.forEach(mod => {
serverResolve.alias[mod] = 'lodash-es/noop'
})

module.exports = {
entry: config.server.entry(),
Expand Down
3 changes: 2 additions & 1 deletion webpack/terser.config.js
@@ -1,11 +1,12 @@
const TerserWebpackPlugin = require('terser-webpack-plugin')

module.exports = () => new TerserWebpackPlugin({
exclude: /tesseract-asset/,
cache: true,
parallel: true,
sourceMap: true,
terserOptions: {
ecma: 6,
ecma: 8,
mangle: true,
compress: {
pure_funcs: ['console.log']
Expand Down

0 comments on commit ca9a32d

Please sign in to comment.