# crop-anyocr-binarize-page-olena-sauvola-denoise-ocropy-deskew-page-ocropy-segment-tesseract-ocropy-shrink-extract-regions.mk
# Install by copying (or symlinking) makefiles into a directory
# where all OCR-D workspaces (unpacked BagIts) reside. Then
# chdir to that location.
# Call via:
# `make -f WORKFLOW-CONFIG.mk WORKSPACE-DIRS` or
# `make -f WORKFLOW-CONFIG.mk all` or just
# `make -f WORKFLOW-CONFIG.mk`
# To rebuild partially, you must pass -W to recursive make:
# `make -f WORKFLOW-CONFIG.mk EXTRA_MAKEFLAGS="-W FILEGRP"`
# To get help on available goals:
# `make help`
###
# From here on, custom configuration begins.
# Print a plain-language summary of what this workflow configuration does.
# The goal never creates a file, so it is declared phony; that keeps a stray
# file or directory called "info" from masking it.
.PHONY: info
info:
	@echo "Read GT segmentation (on any level, merely for page frame),"
	@echo "or if not available, then read image files and binarize+crop,"
	@echo "then binarize+denoise+deskew pages,"
	@echo "then segment into regions and lines,"
	@echo "then shrink regions into the hull polygon of its lines,"
	@echo "and finally extract page images and region coordinates"
	@echo "(including meta-data) into one directory,"
	@echo "with corresponding filename suffixes for segmentation training."
# File group the chain starts from.
INPUT = OCR-D-IMG

# Look up the files of the input group in the workspace and download them.
# The recipe line must begin with a hard TAB, otherwise make aborts with
# "missing separator".
$(INPUT):
	ocrd workspace find -G $@ --download
# Binarize the input images with ocrd-olena-binarize, using its
# "sauvola-ms-split" implementation.
BIN := $(INPUT)-BINPAGE-sauvola
$(BIN): TOOL := ocrd-olena-binarize
$(BIN): PARAMS := "impl": "sauvola-ms-split"
$(BIN): $(INPUT)
# Crop the binarized pages with ocrd-anybaseocr-crop.
CROP = OCR-D-SEG-PAGE-anyocr
$(CROP): $(BIN)
$(CROP): TOOL = ocrd-anybaseocr-crop
# Target-specific variables are inherited by prerequisites. This stage sets
# no parameters of its own, so when it is built on behalf of a later stage
# it could pick up that stage's PARAMS (e.g. the binarizer's "impl" setting)
# unless the included Makefile resets them. Pin PARAMS to empty so the
# cropper always runs with its defaults.
$(CROP): PARAMS =
# Search for a ground-truth segmentation to take the page frame from:
# the first of the PAGE, BLOCK or LINE level GT groups that exists in the
# current directory wins; if none exists, fall back to the cropped images.
# Simply expanded (:=) so that the $(wildcard ...) lookup runs once, instead
# of again on every expansion of INPUT2 and of each name derived from it.
INPUT2 := $(firstword $(foreach GRP,OCR-D-GT-SEG-PAGE OCR-D-GT-SEG-BLOCK OCR-D-GT-SEG-LINE,$(wildcard $(GRP))) $(CROP))
# Binarize whichever group INPUT2 resolved to, again with
# ocrd-olena-binarize and its "sauvola-ms-split" implementation.
BIN2 := $(INPUT2)-BINPAGE-sauvola
$(BIN2): TOOL := ocrd-olena-binarize
$(BIN2): PARAMS := "impl": "sauvola-ms-split"
$(BIN2): $(INPUT2)
# Denoise the binarized pages on page level with ocrd-cis-ocropy-denoise.
DEN := $(BIN2)-DENOISE-ocropy
$(DEN): TOOL := ocrd-cis-ocropy-denoise
$(DEN): PARAMS := "level-of-operation": "page", "noise_maxsize": 3.0
$(DEN): $(BIN2)
# First page-level deskewing pass, done with ocrd-tesserocr-deskew.
FLIP := $(DEN)-DESKEW-tesseract
$(FLIP): TOOL := ocrd-tesserocr-deskew
$(FLIP): PARAMS := "operation_level": "page"
$(FLIP): $(DEN)
# Second page-level deskewing pass, done with ocrd-cis-ocropy-deskew
# ("maxskew" set to 5).
DESK := $(FLIP)-DESKEW-ocropy
$(DESK): TOOL := ocrd-cis-ocropy-deskew
$(DESK): PARAMS := "level-of-operation": "page", "maxskew": 5
$(DESK): $(FLIP)
# Segment the deskewed pages into regions with ocrd-tesserocr-segment-region
# ("find_tables" switched off, "padding" of 5).
BLOCK := OCR-D-SEG-BLOCK-tesseract
$(BLOCK): TOOL := ocrd-tesserocr-segment-region
$(BLOCK): PARAMS := "padding": 5, "find_tables": false
$(BLOCK): $(DESK)
# Post-process the region segmentation with ocrd-segment-repair in its
# "plausibilize" mode (merge threshold: minimum overlap of 0.7).
PLAUSIBLE := $(BLOCK)-plausible
$(PLAUSIBLE): TOOL := ocrd-segment-repair
$(PLAUSIBLE): PARAMS := "plausibilize": true, "plausibilize_merge_min_overlap": 0.7
$(PLAUSIBLE): $(BLOCK)
# Run ocrd-cis-ocropy-clip on the repaired region segmentation.
CLIP = $(BLOCK)-CLIP
$(CLIP): $(PLAUSIBLE)
$(CLIP): TOOL = ocrd-cis-ocropy-clip
# Target-specific variables are inherited by prerequisites. This stage sets
# no parameters of its own, so when it is built on behalf of the following
# deskewing stage it could pick up that stage's PARAMS
# ("operation_level": "region") unless the included Makefile resets them.
# Pin PARAMS to empty so the tool always runs with its defaults.
$(CLIP): PARAMS =
# Deskew once more with ocrd-tesserocr-deskew, this time on region level.
FLIPR := $(CLIP)-DESKEW-tesseract
$(FLIPR): TOOL := ocrd-tesserocr-deskew
$(FLIPR): PARAMS := "operation_level": "region"
$(FLIPR): $(CLIP)
# Segment the regions into lines with ocrd-cis-ocropy-segment
# ("spread" set to 2.4).
LINE := OCR-D-SEG-LINE-tesseract-ocropy
$(LINE): TOOL := ocrd-cis-ocropy-segment
$(LINE): PARAMS := "spread": 2.4
$(LINE): $(FLIPR)
# Run ocrd-segment-repair in "sanitize" mode on the line segmentation; per
# the info text this is the step that shrinks regions to the hull polygon
# of their lines.
TIGHT := OCR-D-SEG-BLOCK-tesseract-ocropy
$(TIGHT): TOOL := ocrd-segment-repair
$(TIGHT): PARAMS := "sanitize": true
$(TIGHT): $(LINE)
# Final stage: extract the regions with ocrd-segment-extract-regions
# ("transparency" enabled).
OUTPUT := OCR-D-IMG-REGIONS
$(OUTPUT): TOOL := ocrd-segment-extract-regions
$(OUTPUT): PARAMS := "transparency": true
$(OUTPUT): $(TIGHT)

# Building the final file group is what a plain `make` should do.
.DEFAULT_GOAL = $(OUTPUT)
# Down here, custom configuration ends.
###
include Makefile