Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add OCR for image descriptions using tesseract.js #1433

Merged
merged 3 commits into from Aug 26, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 2 additions & 1 deletion bin/svgs.js
Expand Up @@ -53,5 +53,6 @@ module.exports = [
{ id: 'fa-refresh', src: 'src/thirdparty/font-awesome-svg-png/white/svg/refresh.svg' },
{ id: 'fa-plus', src: 'src/thirdparty/font-awesome-svg-png/white/svg/plus.svg' },
{ id: 'fa-info-circle', src: 'src/thirdparty/font-awesome-svg-png/white/svg/info-circle.svg' },
{ id: 'fa-crosshairs', src: 'src/thirdparty/font-awesome-svg-png/white/svg/crosshairs.svg' }
{ id: 'fa-crosshairs', src: 'src/thirdparty/font-awesome-svg-png/white/svg/crosshairs.svg' },
{ id: 'fa-magic', src: 'src/thirdparty/font-awesome-svg-png/white/svg/magic.svg' }
]
3 changes: 3 additions & 0 deletions package.json
Expand Up @@ -66,6 +66,7 @@
"express": "^4.17.1",
"file-api": "^0.10.4",
"file-drop-element": "0.2.0",
"file-loader": "^4.2.0",
"form-data": "^2.5.0",
"glob": "^7.1.4",
"indexeddb-getall-shim": "^1.3.5",
Expand Down Expand Up @@ -97,6 +98,8 @@
"svelte-transitions": "^1.2.0",
"svgo": "^1.3.0",
"terser-webpack-plugin": "^1.4.1",
"tesseract.js": "^2.0.0-alpha.13",
"tesseract.js-core": "^2.0.0-beta.10",
"text-encoding": "^0.7.0",
"tiny-queue": "^0.2.1",
"webpack": "^4.39.2",
Expand Down
72 changes: 68 additions & 4 deletions src/routes/_components/dialog/components/MediaAltEditor.html
Expand Up @@ -2,7 +2,7 @@
<textarea
id="the-media-alt-input-{realm}-{index}"
class="media-alt-input"
placeholder="Description"
placeholder="Describe for the visually impaired"
ref:textarea
bind:value=rawText
></textarea>
Expand All @@ -20,6 +20,21 @@
max={mediaAltCharLimit}
style="width: 100%; text-align: right;"
/>
<button class="extract-text-button" type="button"
on:click="onClick()"
disabled={extracting}
>
<SvgIcon href="{extracting ? '#fa-spinner' : '#fa-magic'}"
className="extract-text-svg {extracting ? 'spin' : ''}"
/>
<span>
{#if extracting}
Extracting text…
{:else}
Extract text from image
{/if}
</span>
</button>
</div>
<style>
.media-alt-editor {
Expand All @@ -38,12 +53,35 @@
max-height: 70vh;
}

.extract-text-button {
display: flex;
justify-content: center;
align-items: center;
margin-top: 10px;
}

.extract-text-button span {
margin-left: 15px;
}

:global(.extract-text-svg) {
fill: var(--button-text);
width: 18px;
height: 18px;
}

@media (max-height: 767px) {
.media-alt-input {
max-height: 40vh;
width: 100%;
}
}

@media (min-height: 768px) {
.media-alt-input {
min-width: 250px;
}
}
</style>
<script>
import { requestPostAnimationFrame } from '../../../_utils/requestPostAnimationFrame'
Expand All @@ -57,6 +95,9 @@
import LengthGauge from '../../LengthGauge.html'
import LengthIndicator from '../../LengthIndicator.html'
import { length } from 'stringz'
import { runTesseract } from '../../../_utils/runTesseract'
import SvgIcon from '../../SvgIcon.html'
import { toast } from '../../toast/toast'

const updateRawTextInStore = throttleTimer(requestPostAnimationFrame)

Expand All @@ -72,11 +113,13 @@
store: () => store,
data: () => ({
rawText: '',
mediaAltCharLimit: MEDIA_ALT_CHAR_LIMIT
mediaAltCharLimit: MEDIA_ALT_CHAR_LIMIT,
extracting: false
}),
computed: {
length: ({ rawText }) => length(rawText || ''),
overLimit: ({ mediaAltCharLimit, length }) => length > mediaAltCharLimit
overLimit: ({ mediaAltCharLimit, length }) => length > mediaAltCharLimit,
url: ({ media, index }) => get(media, [index, 'data', 'url'])
},
methods: {
observe,
Expand Down Expand Up @@ -118,11 +161,32 @@
},
measure () {
autosize.update(this.refs.textarea)
},
async onClick () {
this.set({ extracting: true })
try {
const { url } = this.get()
const text = await runTesseract(url)
const { media, index, realm } = this.get()
if (media[index].description !== text) {
media[index].description = text
this.store.setComposeData(realm, { media })
this.store.save()
}
} catch (err) {
console.error(err)
/* no await */ toast.say(
'Unable to extract text. Ensure your instance supports cross-origin resource sharing (CORS) for images.'
)
} finally {
this.set({ extracting: false })
}
}
},
components: {
LengthGauge,
LengthIndicator
LengthIndicator,
SvgIcon
}
}
</script>
4 changes: 4 additions & 0 deletions src/routes/_utils/asyncModules.js
Expand Up @@ -47,3 +47,7 @@ export const importSnackbar = () => import(
export const importComposeBox = () => import(
/* webpackChunkName: 'ComposeBox.html' */ '../_components/compose/ComposeBox.html'
).then(getDefault)

// Lazily loads '../_utils/tesseractWorker.js' through a dynamic import() so the
// module is only fetched on demand. The webpackChunkName magic comment names the
// resulting chunk 'tesseractWorker'.
// The loaded module is passed through getDefault — presumably to unwrap its
// default export, as the sibling import* helpers in this file do (verify there).
export const importTesseractWorker = () => import(
/* webpackChunkName: 'tesseractWorker' */ '../_utils/tesseractWorker.js'
).then(getDefault)
19 changes: 19 additions & 0 deletions src/routes/_utils/runTesseract.js
@@ -0,0 +1,19 @@
import { importTesseractWorker } from '../_utils/asyncModules'

/**
 * Runs OCR on an image using the lazily-loaded shared Tesseract worker.
 *
 * @param image - value handed unchanged to `worker.recognize()`
 * @returns a promise for the `text` field of the recognition result
 */
export async function runTesseract (image) {
  const tesseract = await importTesseractWorker()
  const job = recognizeWithoutBlob(tesseract, image)
  // The object returned by recognize() exposes a progress() hook; log each update.
  job.progress(status => console.log('progress', status))
  const { text } = await job
  return text
}

// Starts recognition while window.Blob is temporarily nulled out.
// TODO: have to trick tesseract into not creating a blob URL because that would break our CSP
// see https://github.com/naptha/tesseract.js/pull/322
function recognizeWithoutBlob (tesseract, image) {
  const savedBlob = window.Blob
  window.Blob = null
  try {
    return tesseract.recognize(image)
  } finally {
    // Restore the real Blob even if recognize() throws synchronously.
    window.Blob = savedBlob
  }
}
21 changes: 21 additions & 0 deletions src/routes/_utils/tesseractWorker.js
@@ -0,0 +1,21 @@
import workerPath from 'tesseract.js/dist/worker.min.js'
// TODO: we should use .wasm instead of .wasm.js. But currently can't because:
// 1. not supported https://github.com/naptha/tesseract.js/blob/9f1e782/docs/local-installation.md#corepath
// 2. webpack defaultRules issues (fixable with https://github.com/webpack/webpack/issues/8412#issuecomment-445586591)

// We should explore this at a later date.
import corePath from 'tesseract.js-core/tesseract-core.wasm.js'
import { TesseractWorker } from 'tesseract.js'

// A single TesseractWorker is created once and shared by every caller.
// Known tesseract.js bug: broken image URLs fail silently. Spawning a fresh
// worker per request would work around it, but that literally creates a new
// web worker each time, which seems excessive, so we live with the bug for now.
// https://github.com/naptha/tesseract.js/issues/325
const baseUrl = location.origin

// Every asset path is made absolute against the page's own origin.
const workerOptions = {
  workerPath: `${baseUrl}/${workerPath}`,
  langPath: `${baseUrl}/`,
  corePath: `${baseUrl}/${corePath}`
}

export default new TesseractWorker(workerOptions)
13 changes: 13 additions & 0 deletions src/service-worker.js
Expand Up @@ -18,11 +18,13 @@ const assets = __assets__
.map(file => file.startsWith('/') ? file : `/${file}`)
.filter(filename => !filename.endsWith('.map'))
.filter(filename => filename !== '/robots.txt')
.filter(filename => !filename.includes('traineddata.gz')) // Tesseract already caches it in IDB

// `shell` is an array of all the files generated by webpack
// also contains '/index.html' for some reason
const webpackAssets = __shell__
.filter(filename => !filename.endsWith('.map')) // don't bother with sourcemaps
.filter(filename => !filename.includes('tesseract-core.wasm')) // cache on-demand

// `routes` is an array of `{ pattern: RegExp }` objects that
// match the pages in your src
Expand Down Expand Up @@ -93,6 +95,17 @@ self.addEventListener('fetch', event => {
if (response) {
return response
}

if (/tesseract-core\.wasm/.test(url.pathname)) {
// cache this on-demand
const response = await fetch(req)
if (response && response.status >= 200 && response.status < 300) {
const clonedResponse = response.clone()
/* no await */ caches.open(WEBPACK_ASSETS).then(cache => cache.put(req, clonedResponse))
}
return response
}

// for routes, serve the /service-worker-index.html file from the most recent
// static cache
if (routes.find(route => route.pattern.test(url.pathname))) {
Expand Down
Binary file added static/eng.traineddata.gz
Binary file not shown.
24 changes: 22 additions & 2 deletions webpack/client.config.js
Expand Up @@ -38,7 +38,24 @@ module.exports = {
{
test: /\/_workers\/blurhash\.js$/,
use: {
loader: 'worker-loader'
loader: 'worker-loader',
options: {
name: 'blurhash.[hash].[name].[ext]'
}
}
},
{
test: [
/tesseract\.js\/dist\/worker\.min\.js$/,
/tesseract\.js\/dist\/worker\.min\.js.map$/,
/tesseract\.js-core\/tesseract-core\.wasm$/,
/tesseract\.js-core\/tesseract-core\.wasm.js$/
],
use: {
loader: 'file-loader',
options: {
name: 'tesseract-asset.[hash].[name].[ext]'
}
}
},
{
Expand Down Expand Up @@ -109,6 +126,9 @@ module.exports = {
]),
devtool: dev ? 'inline-source-map' : 'source-map',
performance: {
hints: dev ? false : 'error' // fail if we exceed the default performance budgets
hints: dev ? false : 'error',
assetFilter: assetFilename => {
return !(/\.map$/.test(assetFilename)) && !/tesseract-asset/.test(assetFilename)
}
}
}
17 changes: 15 additions & 2 deletions webpack/server.config.js
Expand Up @@ -3,9 +3,22 @@ const config = require('sapper/config/webpack.js')
const pkg = require('../package.json')
const { mode, dev, resolve, inlineSvgs } = require('./shared.config')

// modules that the server should ignore, either because they cause errors or warnings
// (because they're only used on the client side)
const NOOP_MODULES = [
'page-lifecycle/dist/lifecycle.mjs',
'../_workers/blurhash',
'tesseract.js/dist/worker.min.js',
'tesseract.js/dist/worker.min.js.map',
'tesseract.js-core/tesseract-core.wasm',
'tesseract.js-core/tesseract-core.wasm.js',
'tesseract.js'
]

const serverResolve = JSON.parse(JSON.stringify(resolve))
serverResolve.alias['page-lifecycle/dist/lifecycle.mjs'] = 'lodash-es/noop' // page lifecycle fails in Node
serverResolve.alias['../_workers/blurhash'] = 'lodash-es/noop' // not used on the server side
// Alias every server-ignored module to lodash's noop so the server bundle
// resolves it to a no-op function instead of the real module.
for (const ignoredModule of NOOP_MODULES) {
  serverResolve.alias[ignoredModule] = 'lodash-es/noop'
}

module.exports = {
entry: config.server.entry(),
Expand Down
3 changes: 2 additions & 1 deletion webpack/terser.config.js
@@ -1,11 +1,12 @@
const TerserWebpackPlugin = require('terser-webpack-plugin')

module.exports = () => new TerserWebpackPlugin({
exclude: /tesseract-asset/,
cache: true,
parallel: true,
sourceMap: true,
terserOptions: {
ecma: 6,
ecma: 8,
mangle: true,
compress: {
pure_funcs: ['console.log']
Expand Down