Bug#908937: ghostscript breaks ocrmypdf autopkgtest

To: submit@bugs.debian.org
Subject: Bug#908937: ghostscript breaks ocrmypdf autopkgtest
From: Paul Gevers <elbrus@debian.org>
Date: Sun, 16 Sep 2018 11:20:46 +0200
Message-id: <[🔎] 619962ff-4694-9d5b-7bcc-8e24605c6b61@debian.org>
Reply-to: Paul Gevers <elbrus@debian.org>, 908937@bugs.debian.org

Source: ghostscript, ocrmypdf
Control: found -1 ghostscript/9.25~dfsg-2
Control: found -1 ocrmypdf/6.2.3-1
X-Debbugs-CC: debian-ci@lists.debian.org
User: debian-ci@lists.debian.org
Usertags: breaks needs-update

Dear maintainers,

With a recent upload of ghostscript the autopkgtest of ocrmypdf fails in
testing when that autopkgtest is run with the binary packages of
ghostscript from unstable. It passes when run with only packages from
testing. I copied some of the output at the bottom of this report.

As ghostscript was uploaded with urgency high, this regression is NOT
delaying the migration of ghostscript to testing [1]. If this
regression requires blocking ghostscript from migrating to testing, fast
action is required (raising the severity of this bug should be enough,
although I haven't tested RC blockage when bugs are assigned to multiple
packages).
Due to the nature of this issue, I filed this bug report against both
packages. Can you please investigate the situation and reassign the bug
to the right package? As needed, please change the bug's severity.

More information about this bug and the reason for filing it can be found on
https://wiki.debian.org/ContinuousIntegration/RegressionEmailInformation

Paul

[1] https://qa.debian.org/excuses.php?package=ghostscript

https://ci.debian.net/data/autopkgtest/testing/amd64/o/ocrmypdf/1000885/log.gz

=================================== FAILURES ===================================
_______________ test_compression_changed[congress.jpg-lossless] ________________

spoof_tesseract_noop = {'ADTTMP':
'/tmp/autopkgtest-lxc.bj5c8t8i/downtmp/autopkgtest_tmp',
'ADT_ARTIFACTS':
'/tmp/autopkgtest-lxc.bj5c8t8i/do...j5c8t8i/downtmp/test-suite-artifacts',
'AUTOPKGTEST_TMP':
'/tmp/autopkgtest-lxc.bj5c8t8i/downtmp/autopkgtest_tmp', ...}
ocrmypdf_exec = ['/usr/bin/python3', '-m', 'ocrmypdf']
resources =
PosixPath('/tmp/autopkgtest-lxc.bj5c8t8i/downtmp/build.24A/src/tests/resources')
image = 'congress.jpg', compression = 'lossless'
outpdf =
'/tmp/pytest-of-debci/pytest-0/test_compression_changed_congr0/out.pdf'

    @pytest.mark.parametrize('image,compression', [
        ('baiona.png', 'jpeg'),
        ('baiona_gray.png', 'lossless'),
        ('congress.jpg', 'lossless')
        ])
    def test_compression_changed(spoof_tesseract_noop, ocrmypdf_exec,
                                 resources, image, compression, outpdf):
        from PIL import Image

        input_file = str(resources / image)
        output_file = str(outpdf)

        im = Image.open(input_file)

        # Runs: ocrmypdf - output.pdf < testfile
        with open(input_file, 'rb') as input_stream:
            p_args = ocrmypdf_exec + [
                '--image-dpi', '150', '--output-type', 'pdfa',
                '--pdfa-image-compression', compression,
                '-', output_file]
            p = Popen(
                p_args, close_fds=True, stdout=PIPE, stderr=PIPE,
                stdin=input_stream, env=spoof_tesseract_noop)
            out, err = p.communicate()

            assert p.returncode == ExitCode.ok, err

        pdfinfo = PdfInfo(output_file)

        pdfimage = pdfinfo[0].images[0]

        if compression == "jpeg":
            assert pdfimage.enc == Encoding.jpeg
        else:
            if ghostscript.jpeg_passthrough_available():
                # Ghostscript 9.23 adds JPEG passthrough, which allows a
JPEG to be
                # copied without transcoding - so report
                if image.endswith('jpg'):
                    assert pdfimage.enc == Encoding.jpeg
            else:
>               assert pdfimage.enc not in (Encoding.jpeg,
Encoding.jpeg2000)
E               AssertionError: assert <Encoding.jpeg: 2> not in
(<Encoding.jpeg: 2>, <Encoding.jpeg2000: 3>)
E                +  where <Encoding.jpeg: 2> = <ImageInfo '/R8' image
1000x1520 Colorspace.rgb 3 8 Encoding.jpeg 150.0x150.0>.enc

tests/test_main.py:917: AssertionError
_________________________ test_preserve_metadata[pdfa]
_________________________

spoof_tesseract_noop = {'ADTTMP':
'/tmp/autopkgtest-lxc.bj5c8t8i/downtmp/autopkgtest_tmp',
'ADT_ARTIFACTS':
'/tmp/autopkgtest-lxc.bj5c8t8i/do...j5c8t8i/downtmp/test-suite-artifacts',
'AUTOPKGTEST_TMP':
'/tmp/autopkgtest-lxc.bj5c8t8i/downtmp/autopkgtest_tmp', ...}
output_type = 'pdfa'
resources =
PosixPath('/tmp/autopkgtest-lxc.bj5c8t8i/downtmp/build.24A/src/tests/resources')
outpdf =
'/tmp/pytest-of-debci/pytest-0/test_preserve_metadata_pdfa_0/out.pdf'

    @pytest.mark.parametrize("output_type", [
        'pdfa', 'pdf'
        ])
    def test_preserve_metadata(spoof_tesseract_noop, output_type,
                               resources, outpdf):
        pdf_before = pypdf.PdfFileReader(str(resources / 'graph.pdf'))

        output = check_ocrmypdf(
                resources / 'graph.pdf', outpdf,
                '--output-type', output_type,
                env=spoof_tesseract_noop)

        pdf_after = pypdf.PdfFileReader(str(output))

        for key in ('/Title', '/Author'):
>           assert pdf_before.documentInfo[key] ==
pdf_after.documentInfo[key]

tests/test_metadata.py:52:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
_ _ _ _

self = {'/Producer': 'GPL Ghostscript 9.25', '/CreationDate':
"D:20150722013418+00'00'", '/ModDate': "D:20180916080247+00'00'"}
key = '/Title'

    def __getitem__(self, key):
>       return dict.__getitem__(self, key).getObject()
E       KeyError: '/Title'

/usr/lib/python3/dist-packages/PyPDF2/generic.py:520: KeyError
----------------------------- Captured stdout call
-----------------------------
   INFO - Output file is a PDF/A-2B (as expected)

_________________________ test_override_metadata[pdfa]
_________________________

spoof_tesseract_noop = {'ADTTMP':
'/tmp/autopkgtest-lxc.bj5c8t8i/downtmp/autopkgtest_tmp',
'ADT_ARTIFACTS':
'/tmp/autopkgtest-lxc.bj5c8t8i/do...j5c8t8i/downtmp/test-suite-artifacts',
'AUTOPKGTEST_TMP':
'/tmp/autopkgtest-lxc.bj5c8t8i/downtmp/autopkgtest_tmp', ...}
output_type = 'pdfa'
resources =
PosixPath('/tmp/autopkgtest-lxc.bj5c8t8i/downtmp/build.24A/src/tests/resources')
outpdf =
'/tmp/pytest-of-debci/pytest-0/test_override_metadata_pdfa_0/out.pdf'

    @pytest.mark.parametrize("output_type", [
        'pdfa', 'pdf'
        ])
    def test_override_metadata(spoof_tesseract_noop, output_type, resources,
                               outpdf):
        input_file = resources / 'c02-22.pdf'
        german = 'Du siehst den Wald vor lauter Bäumen nicht.'
        chinese = '孔子'

        p, out, err = run_ocrmypdf(
            input_file, outpdf,
            '--title', german,
            '--author', chinese,
            '--output-type', output_type,
            env=spoof_tesseract_noop)

        assert p.returncode == ExitCode.ok, err

        before = pypdf.PdfFileReader(str(input_file))
        after = pypdf.PdfFileReader(outpdf)

>       assert after.documentInfo['/Title'] == german

tests/test_metadata.py:79:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
_ _ _ _

self = {'/Producer': 'GPL Ghostscript 9.25', '/CreationDate':
"D:20150722100404+00'00'", '/ModDate': "D:20180916080254+00'00'"}
key = '/Title'

    def __getitem__(self, key):
>       return dict.__getitem__(self, key).getObject()
E       KeyError: '/Title'

Attachment: signature.asc
Description: OpenPGP digital signature

Reply to:

Follow-Ups:
- Processed: ghostscript breaks ocrmypdf autopkgtest
  - From: "Debian Bug Tracking System" <owner@bugs.debian.org>
- Bug#908937: ghostscript breaks ocrmypdf autopkgtest
  - From: Paul Gevers <elbrus@debian.org>
- Bug#908937: ghostscript breaks ocrmypdf autopkgtest
  - From: Sean Whitton <spwhitton@spwhitton.name>
- Processed: Re: Bug#908937: ghostscript breaks ocrmypdf autopkgtest
  - From: "Debian Bug Tracking System" <owner@bugs.debian.org>

Prev by Date: Processed: ghostscript breaks ocrmypdf autopkgtest
Next by Date: Bug#908937: ghostscript breaks ocrmypdf autopkgtest
Previous by thread: Processed: tagging 907663, tagging 891102, tagging 908295, tagging 853710, fixed 905548 in 0.15.0.1+ds-2 ...
Next by thread: Processed: ghostscript breaks ocrmypdf autopkgtest
Index(es):
- Date
- Thread