summaryrefslogtreecommitdiff
path: root/old-gsv-eval.sh
blob: a8cdbd191304f6615f67263b42d88bf1d2d3baa2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
#!/bin/sh

#    Copyright (C) 2014 Assaf Gordon <assafgordon@gmail.com>
#
#    This file is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This file is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this file. If not, see <http://www.gnu.org/licenses/>.
#
#    Written by Assaf Gordon

# This script scans the files in the current directory,
# trying to detect possible non-compliance issues for projects submitted to
# GNU Savannah.
# The general guidelines are here:
#    http://savannah.gnu.org/maintenance/HowToGetYourProjectApprovedQuickly/

# Global flag: starts at 0, is set to 1 by the check_fail_*/check_warn_*
# functions below, and is tested on the last line of the script.
found_errors=0

##
## Write the files to be checked to STDOUT, one name per line.
## If this is a code-repository directory (git/hg),
##   only scan the checked-in files, not generated files.
## If this is a regular directory (implying perhaps, a tarball),
##   scan all files.
##
## NOTES:
## 1. git/hg might report directories, so they are (inefficiently) filtered
##    out by the 'test -f' loop.
## 2. Names are read with 'IFS= read -r' so that leading/trailing blanks and
##    backslashes survive untouched; otherwise such files would silently be
##    dropped here and never reach find_bad_filenames.
## 3. TODO: add SVN/CVS/BZR support.
##
list_files()
{
  if test -d .git ; then
    git ls-files
  elif test -d .hg ; then
    # Keep every field after the status letter, so that a name containing
    # spaces is not cut at its first blank.
    hg status -c | cut -d" " -f2-
  else
    find . -type f | sed 's;^\./;;'
  fi | while IFS= read -r F ; do
    test -f "$F" && printf '%s\n' "$F"
  done
}

## Reads input from STDIN (implied but not critical: list of files, one per
## line) and prints the lines which contain problematic characters.
## The most common offenders: whitespace characters (or NULs), and quotes.
##
## A name is accepted only if it consists solely of ASCII letters, digits
## and the characters  . = _ / + @ -
## Exit code is 0 if at least one problematic name was printed.
##
## While it is not a strict requirement to avoid such characters,
## such files will bring a world of pain to anyone dealing with this project.
##
## The script could theoretically be fortified against this, by
## using 'find -print0', 'xargs -0', 'grep -z',  etc.
find_bad_filenames()
{
    # Inside a POSIX bracket expression a backslash is an ordinary character,
    # so nothing is escaped here (escaping would whitelist '\' itself).
    # The '-' is placed last so that it is taken literally.
    LC_ALL=C grep -v -E '^[A-Za-z0-9.=_/+@-]+$'
}

## Keep only text files.
## Input  (STDIN):  file names, one per line.
## Output (STDOUT): the names whose MIME type, as reported by 'file -i',
##                  starts with "text/".
## NOTES:
## 1. The trailing 'grep .' makes the exit code 0 only if a name was printed.
## 2. 'file -i' is not standard but common enough to be considered portable.
filter_text_files()
{
    xargs file -i |
        awk -F': *' '{ if ($2 ~ /^text\//) print $1 }' |
        grep .
}

## Keep only "application" files (implying binary executable files).
## Input  (STDIN):  file names, one per line.
## Output (STDOUT): the names whose MIME type, as reported by 'file -i',
##                  starts with "application/".
## NOTES:
## 1. The trailing 'grep .' makes the exit code 0 only if a name was printed.
## 2. 'file -i' is not standard but common enough to be considered portable.
filter_binary_application_files()
{
    xargs file -i |
        awk -F': *' '{ if ($2 ~ /^application\//) print $1 }' |
        grep .
}

## Read a list of files from STDIN (assumed to already be text files),
## and write to STDOUT the files which are 10 lines or longer
## (as counted by 'wc -l').
## NOTES:
## 1. 'grep -v ... total' removes the summary line which 'wc -l' prints
##    when it is given more than one file. The pattern is anchored at both
##    ends and allows zero leading blanks: 'wc' may print the total without
##    any padding (e.g. "15 total"), and an unanchored pattern would also
##    discard any line that merely contains " <digits> total".
## 2. 'grep .' ensures the exit code is 0 only if any files are printed.
filter_long_text_files()
{
    xargs wc -l \
        | grep -v '^ *[0-9][0-9]* total$' \
        | awk '$1>=10 {print $2}' \
        | grep .
}

## Reads a list of files from STDIN,
## writes the list to STDOUT, while removing files which are
## common in projects and do not carry a copyright or license statement.
remove_non_checked_files()
{
    perl -MFile::Basename -lne \
'BEGIN {
    @skip_files = qw/
			readme
			readme.md
			readme.txt
			readme.rst
			authors
			news
			copying
			gpl3.txt
			gpl2.txt
			gpl.txt
			hacking
			hacking.md
			thanks
			todo
			license
			license.txt
			.gitignore
			.gitattributtes
			.gitmodules
			.prev-version
                    /;
    %skip_files = map { $_ => 1 } @skip_files ;
}
$b = lc(basename($_));
exists $skip_files{$b} or print $_;
'
}

## Keep only files which are neither text nor compiled binaries.
## Input  (STDIN):  file names, one per line.
## Output (STDOUT): the names whose MIME type, as reported by 'file -i',
##                  starts with neither "text/" nor "application/".
## NOTES:
## 1. The trailing 'grep .' makes the exit code 0 only if a name was printed.
## 2. 'file -i' is not standard but common enough to be considered portable.
filter_binary_data_files()
{
    xargs file -i |
        awk -F': *' '{ if ($2 !~ /^(text|application)\//) print $1 }' |
        grep .
}

## Reads a list of files from STDIN and writes to STDOUT the files in which
## no "clear" copyright statement was found (the whole file is scanned).
## Files removed by remove_non_checked_files are never reported.
## Exit code is 0 only if at least one file was printed ('grep .').
##
## Copyright is tricky. This pattern finds many common cases.
## The "clear copyright" refers to the most common cases:
##  1. The word "Copyright" in English
##  2. Optional "(C)"
## and
##  3.1 A year number (possibly as "2013,2014" and "2009-2014"), followed
##      by few non numeric characters (hinting it's the copyright's holder
##      name or email)
## or
##  3.2 A long string of digits, commas, blanks and minus signs running to
##      the end of the line (the pattern requires at least 16 such characters
##      after the century digits), hinting that the copyright years are a
##      long list and the author knows what he/she is doing.
## or
##  3.3 "@copyright{}" as it appears in texinfo files.
##
## extreme cases:
##   coreutils' ./man/help2man
##   gnulib's   ./lib/strtol.c   ./lib/fstrcmp.c
##   gawk's     test/wideidx2.awk   and ./vms/*.com
##   po/* and man/*.x files
## and many many more
##
## NOTE: awk is started once per file ('xargs -n 1'), so COPYRIGHT_FOUND
## starts unset for every file and FILENAME in END names that single file.
## The second pattern uses a plain '{16,}' interval: stacking another
## duplication operator ('+') on it has undefined results in POSIX EREs.
filter_missing_clear_copyright_files()
{
    remove_non_checked_files \
        | xargs -n 1 awk \
'/[Cc]opyright  *(\([Cc]\))? *(19|20)[0-9][0-9 ,\-]+[A-Za-z\(< ]{4}/ \
 || \
 /[Cc]opyright  *(\([Cc]\))? *(19|20)[0-9 ,\-]{16,}$/ \
 || \
 /[Cc]opyright  *@copyright\{\}  *(19|20)[0-9][0-9]/ \
    { COPYRIGHT_FOUND=1 }
END { if (COPYRIGHT_FOUND==0) { print FILENAME }}' \
        | grep .
}

## Lists the files (names read from STDIN) which contain the whole word
## 'copyright', in any letter case.
## This is a last resort for files that did not have a recognizable
## 'clear' copyright statement.
## Exit code is 0 only if at least one file was printed ('grep .').
##
## NOTE(review): a second function with this very same name is defined
## further down in this file and replaces this definition when the script
## is loaded - one of the two probably needs a different name.
filter_copyright_word_files()
{
    remove_non_checked_files |
        xargs -n 1 grep -i -l -w 'copyright' |
        grep .
}

## Lists the files (names read from STDIN) which do not even contain the
## whole word 'copyright', in any letter case.
## Exit code is 0 only if at least one file was printed ('grep .').
## NOTES:
## 1. "grep -L" is not POSIX standard, but common enough to be used.
## 2. NOTE(review): this function reuses the name of the function defined
##    just above it (which lists files that DO contain the word), so this
##    definition is the one in effect - the name looks like a copy/paste slip.
filter_copyright_word_files()
{
    remove_non_checked_files |
        xargs -n 1 grep -i -L -w 'copyright' |
        grep .
}

#'NR<=MAXLINE && ( \
# /[Cc]opyright  *(\([Cc]\))? *(19|20)[0-9 ,\-]+[A-Za-z\(< ]{4}/ \
# || \
#{ COPYRIGHT_FOUND=1 }
#NR<=MAXLINE && /[Cc]opyright  *@copyright\{\}  *(19|20)[0-9][0-9]/ { COPYRIGHT_FOUND=1 }
#END { if (COPYRIGHT_FOUND==0) { print FILENAME }}' | grep .

## Detecting exact license is tricky.
## This function tries to detect common phrases of few common free licenses.
##
## Reads a list of files from STDIN (files removed by
## remove_non_checked_files are ignored).
## For each file, reads the first 20 lines (MAXLINE), concatenates them,
## removes some extra characters (comments, etc.), then
## tries to detect common phrases that might hint that the
## file has a free license statement.
##
## Writes to STDOUT the files in which no such phrase was found.
## Exit code is 0 only if at least one file was printed ('grep .'), which is
## what the '&&' in the calling step relies on.
##
filter_missing_license_files()
{
    remove_non_checked_files |
    xargs -n 1 awk -v MAXLINE=20 \
'BEGIN { text = "" }
NR<=MAXLINE {
    gsub(/^dnl/,"")
    gsub(/\t/," ")
    gsub(/\#/,"")
    gsub(/\-\-/,"")
    gsub(/\*/,"")
    gsub(/^\/\//,"")
    text = text " " $0
}
END {
    gsub(/  */," ",text)

    POSSIBLE_GPL_FOUND = \
       text ~ /is free software[:;] you can redistribute it and\/or modify it under the terms of the GNU (Lesser|Affero)? *General Public License/

    POSSIBLE_SPECIAL_EXCEPTION_FOUND = \
       text ~ /is free software; as a special exception the author gives unlimited permission/

    POSSIBLE_BSD_FOUND = \
        (text ~ /Redistributions of source code must retain the above copyright notice/ &&
         text ~ /Redistributions in binary form must reproduce the above copyright notice/ )

    POSSIBLE_X11_FOUND = \
        text ~ /Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files/

    POSSIBLE_PERL_FOUND = \
        text ~ /This library is free software; you can redistribute it and\/or modify it under the same terms as Perl itself/

    POSSIBLE_FOUND = ( POSSIBLE_GPL_FOUND ||
        POSSIBLE_SPECIAL_EXCEPTION_FOUND ||
        POSSIBLE_BSD_FOUND ||
        POSSIBLE_X11_FOUND ||
        POSSIBLE_PERL_FOUND)

    if (!POSSIBLE_FOUND) {
        print FILENAME
    }
}' |
    grep .
}


## These are common files containing plain text license,
## as recommended here:
##    http://savannah.gnu.org/maintenance/HowToGetYourProjectApprovedQuickly/
##
## Reads a list of files from STDIN, prints nothing; the exit code is 0 if
## one of the names looks like a license file. Matching is case-insensitive
## and looks only at the last path component:
##  - the words COPYING or LICENSE may appear anywhere in it
##    (COPYING.LESSER, MIT-LICENSE.txt, ...);
##  - the short tokens GPL, AGPL, LGPL, BSD, MIT must stand on their own,
##    optionally followed by a version (GPL3, GPLv2, gpl-3.0.txt), so that
##    names such as 'commit.c' or 'limits.h' are not mistaken for a license.
##
## TODO: should we 'nudge' towards always having 'COPYING' file?
##
## NOTE:
##   The content of the file is NOT checked.
## If someone created a file named 'GPL.TXT' and it doesn't contain the actual
## license, the GNU Savannah review will need to catch this.
find_plaintext_license_file()
{
    grep -q -i -E \
        -e '(COPYING|LICENSE)[^/]*$' \
        -e '(^|[^A-Za-z0-9])(A?GPL|LGPL|BSD|MIT)(v?[0-9][0-9.]*)?([^A-Za-z0-9/][^/]*)?$'
}

## Reads a list of files from STDIN,
## writes to STDOUT the lines (as "file:line-number:text") which mention
## 'linux' but possibly mean 'GNU/Linux'.
##
## If the source files mention the term Linux, advise the user
## that perhaps 'GNU/Linux' is more appropriate.
##
## NOTES:
## The following cases are discarded:
## 1. If the word 'kernel' is mentioned on the same line - the context is
##    likely the linux as a kernel.
## 2. XXX-linux-XXXX - very likely refers to a host/build/target triplet
##    (e.g. powerpc-linux-gnueabi). Skip it.
## 3. -*linux* = appears often in configure/config.guess as OS/ABI detection.
## 4. Linux followed by version (e.g. 'Linux 2.9.6') - The context is
##    likely the linux as a kernel.
## 5. If "GNU/Linux" appears on the same line.
##
## Hack note:
##  grep prefixes matches with the file name only when it is given more than
##  one file. /dev/null is therefore passed as an extra (always empty) file,
##  so the name is printed even if the project has a single text file.
find_term_linux()
{
    xargs grep -w -i -n 'linux' /dev/null |
        grep -vi 'kernel' |
        grep -v '[a-z][a-z]*-linux-[a-z][a-z]*' |
        grep -v -- '-\*linux\*' |
        grep -vi 'linux[- ][0-9]\.[0-9][0-9]*' |
        grep -vi 'gnu/linux'
}

## Announce the check that is about to run.
## Arguments: the words describing the check; they are joined with single
##            spaces into one line of the form "** checking: <words> ...".
check_start()
{
  # "$*" joins all arguments into one string, which is what is wanted inside
  # a larger quoted string; printf prints it verbatim.
  printf '%s\n' "** checking: $* ..."
}

## Print the closing help text (where to read more, whom to ask) and
## terminate the script with exit status 1, so that callers of the script
## can tell that problems were found.
exit_with_help_message()
{
    echo "
** Please try and fix the above errors, and re-run this script.

   For more information, please see:
     https://savannah.gnu.org/register/requirements.php
     http://savannah.gnu.org/maintenance/HowToGetYourProjectApprovedQuickly

   If you have any questions, please send an email to:
     savannah-help-public@gnu.org
"
    found_errors=1
    exit 1
}

## Explain a failed file-name check on STDOUT and record the failure
## in the global found_errors flag.
## NOTE(review): the text announces an abort, but this function itself only
## prints and sets the flag.
check_fail_whitespace_in_fail_names()
{
    cat <<'EOF'

** check failed: the files listed above contain whitespace characters
   (Space, Tab, NULL or other non-printable characters).
   While not a strict requirement, such files tend to cause problems on many
   computer systems, and are better avoided.
   This evaluation script can not verify such files, and will abort now.

EOF
    found_errors=1
}

## Explain on STDOUT why binary application files are a problem and record
## the failure in the global found_errors flag.
check_fail_binary_app_files()
{
    cat <<'EOF'

** check failed: the files listed above are binary application files
   (usually, compiled object/executable files)
   While not a strict requirement, such files are usually created when compiling
   the source code in your package, and there's no need to include them.

EOF
    found_errors=1
}

## Explain on STDOUT what a recognizable copyright statement looks like and
## record the failure in the global found_errors flag.
check_fail_missing_copyright_files()
{
    cat <<'EOF'

** check failed: the files listed above do not carry a recognizable copyright
   statement in the first 20 lines of the file.
   A recognizable copyright statement in this script should be of form:
      Copyright (C) YEAR NAME-OF-COPYRIGHT-HOLDER

   If these are data files or auto-generated files, usually mentioning it
   in the README file is acceptable. Otherwise, please add a copyright statement
   to these files.

   More information here:
      http://www.gnu.org/licenses/gpl-howto.html

   Projects hosted on Savannah must carry a copyright statement.

EOF
    found_errors=1
}

## Explain on STDOUT that no free-software license statement was recognized
## and record the failure in the global found_errors flag.
check_fail_missing_license_files()
{
    cat <<'EOF'

** check failed: the files listed above do not carry a recognizable
   Free Software license statement.

   It is possible these files contain a valid yet unrecognizable form of the
   license statement (usually due to different formatting).

   See here for a list of recommended free-software licenses:
      http://www.gnu.org/licenses/license-list.html#GPLCompatibleLicenses

EOF
    found_errors=1
}

## Explain on STDOUT that no plain-text license file was detected and record
## the failure in the global found_errors flag.
check_fail_missing_plaintext_license_file()
{
    cat <<'EOF'

** check failed: a recognizable plain-text license file was not detected.
   Such file is commonly named 'COPYING' or 'LICENSE'.
   While not a strict requirement, it is highly recommended to include a
   plain-text license file in your project. See here for more details:
      http://savannah.gnu.org/maintenance/HowToGetYourProjectApprovedQuickly/

EOF
    found_errors=1
}

## Print a warning about binary data files on STDOUT.
## Although worded as a warning, this also sets the global found_errors flag.
check_warn_binary_data_files()
{
    cat <<'EOF'

** check warning: the files listed above are binary data files.
   Such files usually can not carry a copyright and license statements.
   Please ensure your project have a README file specifying a copyright
   and license statement for these files.

EOF
    found_errors=1
}

## Print a warning about the use of the term 'Linux' on STDOUT.
## Although worded as a warning, this also sets the global found_errors flag.
check_warn_term_linux()
{
    cat <<'EOF'

** check warning: the files listed above use the term 'Linux'.
   Linux is a Kernel, not a whole operating system. Please consider using
   the correct term 'GNU/Linux' if the context is operating system and not
   Kernel-specific topic.
   See here:
      http://www.gnu.org/gnu/why-gnu-linux.html
      http://www.gnu.org/gnu/linux-and-gnu.html

EOF
    found_errors=1
}

##
## Main sequence. Each step pipes the file list through one of the filter
## functions; a filter prints the offending files and returns 0 when it
## printed something, in which case the matching check_fail_*/check_warn_*
## message is shown and found_errors is set.
##
## Exit status of the script: 0 if no problem was reported, 1 otherwise.
##

##
## Step 0: optionally change into the directory given as first argument
##
DIR=$1
if [ -n "$DIR" ] ; then
    echo "Changing to '$DIR'"
    cd "$DIR" || exit 1
fi

##
## Step 1: avoid non-printable characters
##
FILES=$(list_files)

# Nothing to check: stop here instead of feeding an empty list to the
# filters below (an empty line would be reported as a bad file name).
if [ -z "$FILES" ] ; then
    echo "** no files found to check in '$(pwd)'" >&2
    exit 1
fi

check_start "valid characters in filenames"
if printf '%s\n' "$FILES" | find_bad_filenames ; then
    # The remaining steps pass the names through xargs and cannot cope with
    # such names, so stop here - as the failure message announces.
    check_fail_whitespace_in_fail_names
    exit_with_help_message
    exit 1
fi

##
## Step 2: avoid binary files
##
check_start "binary application files"
printf '%s\n' "$FILES" | filter_binary_application_files &&
    check_fail_binary_app_files

##
## Step 3: Require copyright information in files
##
check_start "copyright statement"
printf '%s\n' "$FILES" | filter_text_files |
    filter_long_text_files | filter_missing_clear_copyright_files &&
        check_fail_missing_copyright_files

##
## Step 4: require free license
##
check_start "free software license statement"
printf '%s\n' "$FILES" | filter_text_files |
    filter_long_text_files | filter_missing_license_files &&
        check_fail_missing_license_files

##
## Step 5: Look for plain-text license file
##
check_start "plain-text license file"
printf '%s\n' "$FILES" | filter_text_files |
    filter_long_text_files | find_plaintext_license_file ||
        check_fail_missing_plaintext_license_file

##
## Step 6: Look for other binary files
##
check_start "binary data files"
printf '%s\n' "$FILES" | filter_binary_data_files &&
    check_warn_binary_data_files

##
## Step 7: Look for misuse of the term 'linux'
##
check_start "possible misuse of the term Linux"
printf '%s\n' "$FILES" | filter_text_files | find_term_linux &&
    check_warn_term_linux

# Report the overall result through the exit status. An explicit if/exit is
# used because 'test ... && cmd' as the last command would make a clean run
# end with status 1 and a failed run end with the status of 'cmd'.
if [ "$found_errors" -ne 0 ] ; then
    exit_with_help_message
    exit 1
fi
exit 0