[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

[lintian] 01/03: L::Check: Check for duplicate words in check_spelling



This is an automated email from the git hooks/post-receive script.

nthykier pushed a commit to branch master
in repository lintian.

commit 2244821278fa06ded9e45bbb362fc983b12a53d2
Author: Niels Thykier <niels@thykier.net>
Date:   Sat Apr 9 13:34:34 2016 +0000

    L::Check: Check for duplicate words in check_spelling
    
    Signed-off-by: Niels Thykier <niels@thykier.net>
---
 checks/binaries.pm                                 |  2 +-
 checks/description.desc                            |  6 ------
 checks/description.pm                              | 20 -----------------
 checks/manpages.pm                                 |  2 +-
 debian/changelog                                   |  7 ++++++
 lib/Lintian/Check.pm                               | 25 +++++++++++++++++++---
 t/tests/description-general/desc                   |  1 -
 t/tests/description-general/tags                   |  9 ++++----
 .../debian/debian/doc-base.doc2                    |  2 +-
 .../debian/debian/copyright                        |  4 ++--
 .../debian/debian/copyright                        |  4 ++--
 t/tests/source-copyright-undefined/tags            |  4 ++--
 12 files changed, 42 insertions(+), 44 deletions(-)

diff --git a/checks/binaries.pm b/checks/binaries.pm
index 88537f8..29bbf0d 100644
--- a/checks/binaries.pm
+++ b/checks/binaries.pm
@@ -360,7 +360,7 @@ sub run {
         };
         my $tag_emitter
           = spelling_tag_emitter('spelling-error-in-binary', $file);
-        check_spelling($strings, $exceptions, $tag_emitter);
+        check_spelling($strings, $exceptions, $tag_emitter, 0);
 
         # stripped?
         if ($fileinfo =~ m,\bnot stripped\b,o) {
diff --git a/checks/description.desc b/checks/description.desc
index 31ba579..4ab1c6e 100644
--- a/checks/description.desc
+++ b/checks/description.desc
@@ -215,12 +215,6 @@ Info: Lintian found a possible capitalization error in the package
  false positives for project names used in a context where they should be
  lowercase, such as package names or executable names.
 
-Tag: description-contains-duplicated-word
-Severity: normal
-Certainty: possible
-Info: The description contains a duplicated word.  Usually this is a
- mistake, or at least an awkward phrasing.
-
 Tag: using-first-person-in-description
 Severity: minor
 Certainty: possible
diff --git a/checks/description.pm b/checks/description.pm
index badf31f..396a338 100644
--- a/checks/description.pm
+++ b/checks/description.pm
@@ -157,26 +157,6 @@ sub run {
             tag 'description-contains-dh-make-perl-template';
         }
 
-        # Check for duplicated words.  We want to catch "this this."
-        # but not "ITU-T T.81", so compare non-whitespace sequences
-        # rather than word characters but allow punctuation at the
-        # end.
-        #
-        # We don't want to think ", ," or "a, a" is a duplicated word,
-        # so require that a word start and end with a word character.
-        #
-        # We replace text that is quoted with ' "" '.  The assumption
-        # is that quoted words are "okay" and blindly removing them
-        # causes false positives with text like "'a' or 'b' or 'c'".
-        my $stripped = $_;
-        $stripped =~ s,(["'])(.*?)(\1), "" ,g;
-        while ($stripped
-            =~ m%(?:\s|^)((\w(?:\S*\w)?)(\s+(\2))+)(?:[\).,?!:;\s]|\z)%i) {
-            my $words = $1;
-            $stripped =~ s/\Q$words//;
-            tag 'description-contains-duplicated-word', $words;
-        }
-
         my $first_person = $_;
         while ($first_person
             =~ m/(?:^|\s)(I|[Mm]y|[Oo]urs?|mine|myself|me|us|[Ww]e)(?:$|\s)/) {
diff --git a/checks/manpages.pm b/checks/manpages.pm
index f8267bb..be7bd45 100644
--- a/checks/manpages.pm
+++ b/checks/manpages.pm
@@ -304,7 +304,7 @@ sub run {
                 }
                 # Check for spelling errors if the manpage is English
                 check_spelling($line, $ginfo->spelling_exceptions,
-                    $stag_emitter)
+                    $stag_emitter, 0)
                   if ($path =~ m,/man/man\d/,);
             }
         }
diff --git a/debian/changelog b/debian/changelog
index ac73de0..332a927 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -2,6 +2,9 @@ lintian (2.5.44) UNRELEASED; urgency=medium
 
   XXX: generate tag summary with private/generate-tag-summary
 
+  * checks/description.{desc,pm}:
+    + [NT] Drop duplicate word tag, which is now covered by the
+      spelling error check.
   * checks/fields.pm:
     + [NT] Add default-jdk-headless and openjdk-X-jdk-headless
       to the set of known java providers.
@@ -28,6 +31,10 @@ lintian (2.5.44) UNRELEASED; urgency=medium
     + [AB] Declare compliance with Debian Policy 3.9.8. (No other changes
       required.)
 
+  * lib/Lintian/Check.pm:
+    + [JW,NT] Flag a duplicate word as a spelling error.  This affects
+      several tags plus spellintian.  (Closes: #800476)
+
  -- Niels Thykier <niels@thykier.net>  Sun, 03 Apr 2016 18:48:59 +0000
 
 lintian (2.5.43) unstable; urgency=medium
diff --git a/lib/Lintian/Check.pm b/lib/Lintian/Check.pm
index 0f2d214..cb2a380 100644
--- a/lib/Lintian/Check.pm
+++ b/lib/Lintian/Check.pm
@@ -260,7 +260,7 @@ Returns the number of spelling mistakes found in TEXT.
 my (%CORRECTIONS, @CORRECTIONS_MULTIWORD);
 
 sub check_spelling {
-    my ($text, $exceptions, $code_ref) = @_;
+    my ($text, $exceptions, $code_ref, $duplicate_check) = @_;
     return 0 unless $text;
     if (not $code_ref and $exceptions and ref($exceptions) eq 'CODE') {
         $code_ref = $exceptions;
@@ -268,8 +268,9 @@ sub check_spelling {
     } else {
         $exceptions //= {};
     }
+    $duplicate_check //= 1;
 
-    my %seen;
+    my (%seen, %duplicates, $last_word, $quoted);
     my $counter = 0;
 
     if (!%CORRECTIONS) {
@@ -293,7 +294,25 @@ sub check_spelling {
     strip($text);
 
     for my $word (split(' ', $text)) {
-        $word =~ s/[.,;:?!]+$//;
+        my $ends_with_punct = 0;
+        my $q = $word =~ tr/"/"/;
+        # Toggle quoting on an unbalanced '"foo' or 'foo"' but not '"foo"'.
+        if ($q & 1) {
+            $quoted = not $quoted;
+        }
+        $ends_with_punct = 1 if $word =~ s/[.,;:?!]+$//;
+
+        if ($duplicate_check and defined($last_word) and $last_word eq $word) {
+            # Avoid flagging words inside quoted text.
+            $code_ref->("$word $word (duplicate word)", $word)
+              if not $quoted and not $duplicates{$word}++;
+        }
+
+        if ($word =~ m/^[A-Za-z]+$/ and not $ends_with_punct) {
+            $last_word = $word;
+        } else {
+            $last_word = undef;
+        }
         next if ($word =~ /^[A-Z]{1,5}\z/);
         # Some exceptions are based on case (e.g. "teH").
         next if exists($exceptions->{$word});
diff --git a/t/tests/description-general/desc b/t/tests/description-general/desc
index 3d8640d..256bd48 100644
--- a/t/tests/description-general/desc
+++ b/t/tests/description-general/desc
@@ -6,7 +6,6 @@ Test-For:
  capitalization-error-in-description
  capitalization-error-in-description-synopsis
  description-contains-dh-make-perl-template
- description-contains-duplicated-word
  description-contains-homepage
  description-contains-invalid-control-statement
  description-contains-tabs
diff --git a/t/tests/description-general/tags b/t/tests/description-general/tags
index 437cbcc..c9b1bb1 100644
--- a/t/tests/description-general/tags
+++ b/t/tests/description-general/tags
@@ -23,8 +23,7 @@ W: description-general-4: spelling-error-in-description mroe more
 W: description-general-syn-article: description-synopsis-starts-with-article
 W: description-general-syn-spelling: spelling-error-in-description-synopsis developement development
 W: description-general: description-contains-dh-make-perl-template
-W: description-general: description-contains-duplicated-word All all all
-W: description-general: description-contains-duplicated-word The the
-W: description-general: description-contains-duplicated-word matched matched matched
-W: description-general: description-contains-duplicated-word of of
-W: description-general: description-contains-duplicated-word these these
+W: description-general: spelling-error-in-description all all (duplicate word) all
+W: description-general: spelling-error-in-description matched matched (duplicate word) matched
+W: description-general: spelling-error-in-description of of (duplicate word) of
+W: description-general: spelling-error-in-description these these (duplicate word) these
diff --git a/t/tests/menus-doc-base-general/debian/debian/doc-base.doc2 b/t/tests/menus-doc-base-general/debian/debian/doc-base.doc2
index ac2e764..6dcc2f5 100644
--- a/t/tests/menus-doc-base-general/debian/debian/doc-base.doc2
+++ b/t/tests/menus-doc-base-general/debian/debian/doc-base.doc2
@@ -4,7 +4,7 @@ Title: Document 2
 Author: Lintian maintainers
 Author: Duplicate field... oops.
 Abstract: Document 2
- Blah blah manage online manuals Debian blah blah
+ Blah bla manage online manuals Debian blah bla
   .
   The second line is totally a template.  Oh yeah, a space too
   much on the " ."-line followed by wrong indentation.
diff --git a/t/tests/source-copyright-license-header/debian/debian/copyright b/t/tests/source-copyright-license-header/debian/debian/copyright
index 5085cf1..da6a69d 100644
--- a/t/tests/source-copyright-license-header/debian/debian/copyright
+++ b/t/tests/source-copyright-license-header/debian/debian/copyright
@@ -6,7 +6,7 @@ License: public-domain
  some public-domain
 
 License: golf
- blah blah
+ blah bla
  This should not trigger a dep5-file-paragraph-reference-header
  tag.
 
@@ -18,7 +18,7 @@ License: public-domain
 
 Files: debian/*
 Copyright: 2014, somebody1
-License: this
+License: this-license
  this is a valid license short name
 
 Files: debian/compat
diff --git a/t/tests/source-copyright-undefined/debian/debian/copyright b/t/tests/source-copyright-undefined/debian/debian/copyright
index 24c9758..dea5f48 100644
--- a/t/tests/source-copyright-undefined/debian/debian/copyright
+++ b/t/tests/source-copyright-undefined/debian/debian/copyright
@@ -6,7 +6,7 @@ Source: http://examples.com/doohickey/source/
 Files: *
 Copyright: 2014, somebody1
 License: Fixme
- Fixme
+ Fixme-license text
 
 
 Files: debian/*
@@ -39,7 +39,7 @@ License: -
 Files: debian/e
 Comment: too many false positive with space
 Copyright: 2014, somebody2
-License: undefined undefined
+License: undefined license
  Fixme
 
 Files: debian/f
diff --git a/t/tests/source-copyright-undefined/tags b/t/tests/source-copyright-undefined/tags
index 7260b54..2fc4b04 100644
--- a/t/tests/source-copyright-undefined/tags
+++ b/t/tests/source-copyright-undefined/tags
@@ -2,7 +2,7 @@ E: source-copyright-undefined source: license-problem-undefined-license - (parag
 E: source-copyright-undefined source: license-problem-undefined-license fixme (paragraph at line 6)
 E: source-copyright-undefined source: license-problem-undefined-license todo (paragraph at line 22)
 E: source-copyright-undefined source: license-problem-undefined-license undefined (paragraph at line 45)
-E: source-copyright-undefined source: license-problem-undefined-license undefined undefined (paragraph at line 39)
+E: source-copyright-undefined source: license-problem-undefined-license undefined license (paragraph at line 39)
 E: source-copyright-undefined source: license-problem-undefined-license unknow (paragraph at line 17)
 E: source-copyright-undefined source: license-problem-undefined-license unknown (paragraph at line 12)
 I: source-copyright-undefined source: unused-file-paragraph-in-dep5-copyright paragraph at line 17
@@ -23,4 +23,4 @@ I: source-copyright-undefined source: wildcard-matches-nothing-in-dep5-copyright
 I: source-copyright-undefined: spelling-error-in-copyright unknow unknown
 W: source-copyright-undefined source: dep5-copyright-license-name-not-unique (paragraph at line 33)
 W: source-copyright-undefined source: space-in-std-shortname-in-dep5-copyright common public license - v 1.0 (paragraph at line 51)
-W: source-copyright-undefined source: space-in-std-shortname-in-dep5-copyright undefined undefined (paragraph at line 39)
+W: source-copyright-undefined source: space-in-std-shortname-in-dep5-copyright undefined license (paragraph at line 39)

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/lintian/lintian.git


Reply to: