[lintian] 01/03: L::Check: Check for duplicate words in check_spelling
This is an automated email from the git hooks/post-receive script.
nthykier pushed a commit to branch master
in repository lintian.
commit 2244821278fa06ded9e45bbb362fc983b12a53d2
Author: Niels Thykier <niels@thykier.net>
Date: Sat Apr 9 13:34:34 2016 +0000
L::Check: Check for duplicate words in check_spelling
Signed-off-by: Niels Thykier <niels@thykier.net>
---
checks/binaries.pm | 2 +-
checks/description.desc | 6 ------
checks/description.pm | 20 -----------------
checks/manpages.pm | 2 +-
debian/changelog | 7 ++++++
lib/Lintian/Check.pm | 25 +++++++++++++++++++---
t/tests/description-general/desc | 1 -
t/tests/description-general/tags | 9 ++++----
.../debian/debian/doc-base.doc2 | 2 +-
.../debian/debian/copyright | 4 ++--
.../debian/debian/copyright | 4 ++--
t/tests/source-copyright-undefined/tags | 4 ++--
12 files changed, 42 insertions(+), 44 deletions(-)
diff --git a/checks/binaries.pm b/checks/binaries.pm
index 88537f8..29bbf0d 100644
--- a/checks/binaries.pm
+++ b/checks/binaries.pm
@@ -360,7 +360,7 @@ sub run {
};
my $tag_emitter
= spelling_tag_emitter('spelling-error-in-binary', $file);
- check_spelling($strings, $exceptions, $tag_emitter);
+ check_spelling($strings, $exceptions, $tag_emitter, 0);
# stripped?
if ($fileinfo =~ m,\bnot stripped\b,o) {
diff --git a/checks/description.desc b/checks/description.desc
index 31ba579..4ab1c6e 100644
--- a/checks/description.desc
+++ b/checks/description.desc
@@ -215,12 +215,6 @@ Info: Lintian found a possible capitalization error in the package
false positives for project names used in a context where they should be
lowercase, such as package names or executable names.
-Tag: description-contains-duplicated-word
-Severity: normal
-Certainty: possible
-Info: The description contains a duplicated word. Usually this is a
- mistake, or at least an awkward phrasing.
-
Tag: using-first-person-in-description
Severity: minor
Certainty: possible
diff --git a/checks/description.pm b/checks/description.pm
index badf31f..396a338 100644
--- a/checks/description.pm
+++ b/checks/description.pm
@@ -157,26 +157,6 @@ sub run {
tag 'description-contains-dh-make-perl-template';
}
- # Check for duplicated words. We want to catch "this this."
- # but not "ITU-T T.81", so compare non-whitespace sequences
- # rather than word characters but allow punctuation at the
- # end.
- #
- # We don't want to think ", ," or "a, a" is a duplicated word,
- # so require that a word start and end with a word character.
- #
- # We replace text that is quoted with ' "" '. The assumption
- # is that quoted words are "okay" and blindly removing them
- # causes false positives with text like "'a' or 'b' or 'c'".
- my $stripped = $_;
- $stripped =~ s,(["'])(.*?)(\1), "" ,g;
- while ($stripped
- =~ m%(?:\s|^)((\w(?:\S*\w)?)(\s+(\2))+)(?:[\).,?!:;\s]|\z)%i) {
- my $words = $1;
- $stripped =~ s/\Q$words//;
- tag 'description-contains-duplicated-word', $words;
- }
-
my $first_person = $_;
while ($first_person
=~ m/(?:^|\s)(I|[Mm]y|[Oo]urs?|mine|myself|me|us|[Ww]e)(?:$|\s)/) {
diff --git a/checks/manpages.pm b/checks/manpages.pm
index f8267bb..be7bd45 100644
--- a/checks/manpages.pm
+++ b/checks/manpages.pm
@@ -304,7 +304,7 @@ sub run {
}
# Check for spelling errors if the manpage is English
check_spelling($line, $ginfo->spelling_exceptions,
- $stag_emitter)
+ $stag_emitter, 0)
if ($path =~ m,/man/man\d/,);
}
}
diff --git a/debian/changelog b/debian/changelog
index ac73de0..332a927 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -2,6 +2,9 @@ lintian (2.5.44) UNRELEASED; urgency=medium
XXX: generate tag summary with private/generate-tag-summary
+ * checks/description.{desc,pm}:
+ + [NT] Drop duplicate word tag, which is now covered by the
+ spelling error check.
* checks/fields.pm:
+ [NT] Add default-jdk-headless and openjdk-X-jdk-headless
to the set of known java providers.
@@ -28,6 +31,10 @@ lintian (2.5.44) UNRELEASED; urgency=medium
+ [AB] Declare compliance with Debian Policy 3.9.8. (No other changes
required.)
+ * lib/Lintian/Check.pm:
+ + [JW,NT] Flag a duplicate word as a spelling error. This affects
+ several tags plus spellintian. (Closes: #800476)
+
-- Niels Thykier <niels@thykier.net> Sun, 03 Apr 2016 18:48:59 +0000
lintian (2.5.43) unstable; urgency=medium
diff --git a/lib/Lintian/Check.pm b/lib/Lintian/Check.pm
index 0f2d214..cb2a380 100644
--- a/lib/Lintian/Check.pm
+++ b/lib/Lintian/Check.pm
@@ -260,7 +260,7 @@ Returns the number of spelling mistakes found in TEXT.
my (%CORRECTIONS, @CORRECTIONS_MULTIWORD);
sub check_spelling {
- my ($text, $exceptions, $code_ref) = @_;
+ my ($text, $exceptions, $code_ref, $duplicate_check) = @_;
return 0 unless $text;
if (not $code_ref and $exceptions and ref($exceptions) eq 'CODE') {
$code_ref = $exceptions;
@@ -268,8 +268,9 @@ sub check_spelling {
} else {
$exceptions //= {};
}
+ $duplicate_check //= 1;
- my %seen;
+ my (%seen, %duplicates, $last_word, $quoted);
my $counter = 0;
if (!%CORRECTIONS) {
@@ -293,7 +294,25 @@ sub check_spelling {
strip($text);
for my $word (split(' ', $text)) {
- $word =~ s/[.,;:?!]+$//;
+ my $ends_with_punct = 0;
+ my $q = $word =~ tr/"/"/;
+ # Toggle the quoted state on an opening '"foo' or a closing 'foo"' (odd number of quotes), but not on a fully quoted '"foo"'.
+ if ($q & 1) {
+ $quoted = not $quoted;
+ }
+ $ends_with_punct = 1 if $word =~ s/[.,;:?!]+$//;
+
+ if ($duplicate_check and defined($last_word) and $last_word eq $word) {
+ # Avoid flagging words inside quoted text.
+ $code_ref->("$word $word (duplicate word)", $word)
+ if not $quoted and not $duplicates{$word}++;
+ }
+
+ if ($word =~ m/^[A-Za-z]+$/ and not $ends_with_punct) {
+ $last_word = $word;
+ } else {
+ $last_word = undef;
+ }
next if ($word =~ /^[A-Z]{1,5}\z/);
# Some exceptions are based on case (e.g. "teH").
next if exists($exceptions->{$word});
diff --git a/t/tests/description-general/desc b/t/tests/description-general/desc
index 3d8640d..256bd48 100644
--- a/t/tests/description-general/desc
+++ b/t/tests/description-general/desc
@@ -6,7 +6,6 @@ Test-For:
capitalization-error-in-description
capitalization-error-in-description-synopsis
description-contains-dh-make-perl-template
- description-contains-duplicated-word
description-contains-homepage
description-contains-invalid-control-statement
description-contains-tabs
diff --git a/t/tests/description-general/tags b/t/tests/description-general/tags
index 437cbcc..c9b1bb1 100644
--- a/t/tests/description-general/tags
+++ b/t/tests/description-general/tags
@@ -23,8 +23,7 @@ W: description-general-4: spelling-error-in-description mroe more
W: description-general-syn-article: description-synopsis-starts-with-article
W: description-general-syn-spelling: spelling-error-in-description-synopsis developement development
W: description-general: description-contains-dh-make-perl-template
-W: description-general: description-contains-duplicated-word All all all
-W: description-general: description-contains-duplicated-word The the
-W: description-general: description-contains-duplicated-word matched matched matched
-W: description-general: description-contains-duplicated-word of of
-W: description-general: description-contains-duplicated-word these these
+W: description-general: spelling-error-in-description all all (duplicate word) all
+W: description-general: spelling-error-in-description matched matched (duplicate word) matched
+W: description-general: spelling-error-in-description of of (duplicate word) of
+W: description-general: spelling-error-in-description these these (duplicate word) these
diff --git a/t/tests/menus-doc-base-general/debian/debian/doc-base.doc2 b/t/tests/menus-doc-base-general/debian/debian/doc-base.doc2
index ac2e764..6dcc2f5 100644
--- a/t/tests/menus-doc-base-general/debian/debian/doc-base.doc2
+++ b/t/tests/menus-doc-base-general/debian/debian/doc-base.doc2
@@ -4,7 +4,7 @@ Title: Document 2
Author: Lintian maintainers
Author: Duplicate field... oops.
Abstract: Document 2
- Blah blah manage online manuals Debian blah blah
+ Blah bla manage online manuals Debian blah bla
.
The second line is totally a template. Oh yeah, a space too
much on the " ."-line followed by wrong indentation.
diff --git a/t/tests/source-copyright-license-header/debian/debian/copyright b/t/tests/source-copyright-license-header/debian/debian/copyright
index 5085cf1..da6a69d 100644
--- a/t/tests/source-copyright-license-header/debian/debian/copyright
+++ b/t/tests/source-copyright-license-header/debian/debian/copyright
@@ -6,7 +6,7 @@ License: public-domain
some public-domain
License: golf
- blah blah
+ blah bla
This should not trigger a dep5-file-paragraph-reference-header
tag.
@@ -18,7 +18,7 @@ License: public-domain
Files: debian/*
Copyright: 2014, somebody1
-License: this
+License: this-license
this is a valid license short name
Files: debian/compat
diff --git a/t/tests/source-copyright-undefined/debian/debian/copyright b/t/tests/source-copyright-undefined/debian/debian/copyright
index 24c9758..dea5f48 100644
--- a/t/tests/source-copyright-undefined/debian/debian/copyright
+++ b/t/tests/source-copyright-undefined/debian/debian/copyright
@@ -6,7 +6,7 @@ Source: http://examples.com/doohickey/source/
Files: *
Copyright: 2014, somebody1
License: Fixme
- Fixme
+ Fixme-license text
Files: debian/*
@@ -39,7 +39,7 @@ License: -
Files: debian/e
Comment: too many false positive with space
Copyright: 2014, somebody2
-License: undefined undefined
+License: undefined license
Fixme
Files: debian/f
diff --git a/t/tests/source-copyright-undefined/tags b/t/tests/source-copyright-undefined/tags
index 7260b54..2fc4b04 100644
--- a/t/tests/source-copyright-undefined/tags
+++ b/t/tests/source-copyright-undefined/tags
@@ -2,7 +2,7 @@ E: source-copyright-undefined source: license-problem-undefined-license - (parag
E: source-copyright-undefined source: license-problem-undefined-license fixme (paragraph at line 6)
E: source-copyright-undefined source: license-problem-undefined-license todo (paragraph at line 22)
E: source-copyright-undefined source: license-problem-undefined-license undefined (paragraph at line 45)
-E: source-copyright-undefined source: license-problem-undefined-license undefined undefined (paragraph at line 39)
+E: source-copyright-undefined source: license-problem-undefined-license undefined license (paragraph at line 39)
E: source-copyright-undefined source: license-problem-undefined-license unknow (paragraph at line 17)
E: source-copyright-undefined source: license-problem-undefined-license unknown (paragraph at line 12)
I: source-copyright-undefined source: unused-file-paragraph-in-dep5-copyright paragraph at line 17
@@ -23,4 +23,4 @@ I: source-copyright-undefined source: wildcard-matches-nothing-in-dep5-copyright
I: source-copyright-undefined: spelling-error-in-copyright unknow unknown
W: source-copyright-undefined source: dep5-copyright-license-name-not-unique (paragraph at line 33)
W: source-copyright-undefined source: space-in-std-shortname-in-dep5-copyright common public license - v 1.0 (paragraph at line 51)
-W: source-copyright-undefined source: space-in-std-shortname-in-dep5-copyright undefined undefined (paragraph at line 39)
+W: source-copyright-undefined source: space-in-std-shortname-in-dep5-copyright undefined license (paragraph at line 39)
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/lintian/lintian.git
Reply to: