From 31fe67d7ee016579ad3827c44fe7d2bf61f3efbe Mon Sep 17 00:00:00 2001 From: Dave Meikle Date: Mon, 2 Dec 2019 19:03:00 +0000 Subject: [PATCH] TIKA-2630: Wrong height and width metadata for JPEG images (#255) * TIKA-2630: - Added extraction of image height/width from ExifSubIFDDirectory for compressed images - Include directory name as key qualifier for Exif directories to avoid clashes * TIKA-2630: Tidied up code # Conflicts: # tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java --- .../parser/image/ImageMetadataExtractor.java | 23 +++++++++++++++++-- .../tika/parser/jpeg/JpegParserTest.java | 10 ++++---- .../parser/ocr/TesseractOCRParserTest.java | 2 +- .../apache/tika/parser/rtf/RTFParserTest.java | 2 +- 4 files changed, 28 insertions(+), 9 deletions(-) diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java index aeb0223b0b..f6670d81b3 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java @@ -260,7 +260,11 @@ public void handle(Directory directory, Metadata metadata) throws MetadataException { if (directory.getTags() != null) { for (Tag tag : directory.getTags()) { - metadata.set(tag.getTagName(), tag.getDescription()); + if (directory instanceof ExifDirectoryBase) { + metadata.set(directory.getName() + ":" + tag.getTagName(), tag.getDescription()); + } else { + metadata.set(tag.getTagName(), tag.getDescription()); + } } } } @@ -288,7 +292,11 @@ public void handle(Directory directory, Metadata metadata) } else if (Boolean.FALSE.toString().equalsIgnoreCase(value)) { value = Boolean.FALSE.toString(); } - metadata.set(name, value); + if (directory instanceof ExifDirectoryBase) { + metadata.set(directory.getName() + ":" + name, value); + } else { + metadata.set(name, value); + } } } } @@ 
-493,6 +501,17 @@ public void handlePhotoTags(Directory directory, Metadata metadata) { metadata.set(Metadata.IMAGE_LENGTH, trimPixels(directory.getDescription(ExifThumbnailDirectory.TAG_IMAGE_HEIGHT))); } + + // For Compressed Images read from ExifSubIFDDirectory + if (directory.containsTag(ExifSubIFDDirectory.TAG_EXIF_IMAGE_WIDTH)) { + metadata.set(Metadata.IMAGE_WIDTH, + trimPixels(directory.getDescription(ExifSubIFDDirectory.TAG_EXIF_IMAGE_WIDTH))); + } + if (directory.containsTag(ExifSubIFDDirectory.TAG_EXIF_IMAGE_HEIGHT)) { + metadata.set(Metadata.IMAGE_LENGTH, + trimPixels(directory.getDescription(ExifSubIFDDirectory.TAG_EXIF_IMAGE_HEIGHT))); + } + } /** diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java index c710f2390f..a1339d5450 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java @@ -65,8 +65,8 @@ public void testJPEG() throws Exception { parser.parse(stream, new DefaultHandler(), metadata, new ParseContext()); // Core EXIF/TIFF tags - assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH)); - assertEquals("68", metadata.get(Metadata.IMAGE_LENGTH)); + assertEquals("3888", metadata.get(Metadata.IMAGE_WIDTH)); + assertEquals("2592", metadata.get(Metadata.IMAGE_LENGTH)); assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE)); assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL)); @@ -86,7 +86,7 @@ public void testJPEG() throws Exception { // Check that EXIF/TIFF tags come through with their raw values too // (This may be removed for Tika 1.0, as we support more of them // with explicit Metadata entries) - assertEquals("Canon EOS 40D", metadata.get("Model")); + assertEquals("Canon EOS 40D", metadata.get("Exif IFD0:Model")); // Common tags assertEquals("2009-10-02T23:02:49", metadata.get(Metadata.LAST_MODIFIED)); @@ -115,8 
+115,8 @@ public void testJPEGGeo() throws Exception { assertEquals("-54.1234", metadata.get(Metadata.LONGITUDE)); // Core EXIF/TIFF tags - assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH)); - assertEquals("68", metadata.get(Metadata.IMAGE_LENGTH)); + assertEquals("3888", metadata.get(Metadata.IMAGE_WIDTH)); + assertEquals("2592", metadata.get(Metadata.IMAGE_LENGTH)); assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE)); assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL)); diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java index 45ef4e26f1..7e3f01c4ec 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java @@ -256,7 +256,7 @@ public void getNormalMetadataToo() throws Exception { m = getXML("testTIFF.tif").metadata; assertEquals("100", m.get(Metadata.IMAGE_WIDTH)); assertEquals("75", m.get(Metadata.IMAGE_LENGTH)); - assertEquals("72 dots per inch", m.get("Y Resolution")); + assertEquals("72 dots per inch", m.get("Exif IFD0:Y Resolution")); } //TODO: add unit tests for jp2/jpx/ppm TIKA-2174 diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java index 27f3b2af37..79c5834d66 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java @@ -466,7 +466,7 @@ public void testRegularImages() throws Exception { assertEquals("false", meta_jpg_exif.get(RTFMetadata.THUMBNAIL)); assertEquals(51, meta_jpg.names().length); - assertEquals(110, meta_jpg_exif.names().length); + assertEquals(112, meta_jpg_exif.names().length); } @Test