spell check: add ...:…

and a few fixes :) handle #spellok in shell or cmake files allow to exclude scripts from search files
qgis · Aug 12, 2017 · b00a60d · b00a60d · nyalldawson · Aug 13, 2017
1 parent 5fd19d5
commit b00a60d
Show file tree

Hide file tree

Showing 2 changed files with 74 additions and 25 deletions.
diff --git a/scripts/spell_check/check_spelling.sh b/scripts/spell_check/check_spelling.sh
@@ -22,6 +22,9 @@
 #   set -x
 # fi
 
+# extensions or files that should be excluded from file list if :% is appended in the spelling.dat file
+EXCLUDE_SCRIPT_LIST='(\.(xml|svg|sip|t2t|pl|sh|qgs|badquote|cmake(\.in)?)|^(debian/copyright|cmake_templates/.*|INSTALL|NEWS|tests/testdata/labeling/README.rst|tests/testdata/font/QGIS-Vera/COPYRIGHT.TXT|doc/(news|INSTALL)\.html))$'
+
 DIR=$(git rev-parse --show-toplevel)/scripts/spell_check
 
 AGIGNORE=${DIR}/.agignore
@@ -67,10 +70,10 @@ else
 fi
 
 # regex to find escape string
-SPELLOKRX='(#\s*spellok|<!--\s*#\s*spellok\s*-->)$'
+SPELLOKRX='(#\s*spellok|<!--\s*#\s*spellok\s*-->)'
 
 # split into several files to avoid too long regexes
-SPLIT=6
+SPLIT=8
 
 ${GP}split --number=l/$SPLIT --numeric-suffixes --suffix-length=2 --additional-suffix=~ ${DIR}/spelling.dat spelling
 
@@ -84,25 +87,26 @@ ERRORFOUND=NO
 for I in $(seq -f '%02g' 0  $(($SPLIT-1)) ) ; do
   ( [[ "$INTERACTIVE" =~ YES ]] || [[ "$TRAVIS" =~ true ]] ) && printf "Progress: %d/%d\r" $(( I + 1 )) $SPLIT
   SPELLFILE=spelling$I~
+  ${GP}sed -i '/^#/d' $SPELLFILE
 
   # if correction contains an uppercase letter and is the same as the error character wise, this means that the error is searched as a full word and case sensitive (not incorporated in a bigger one)
   CASEMATCH_FIXCASE=$(${GP}sed -rn '/^(\w+):\1(:\*)?$/Ip' $SPELLFILE | ${GP}sed -r 's/^(\w+):\1(:\*)?$/(\\b|_)\1(\\b|_)/I')
   REMAINS=$(          ${GP}sed -r  '/^(\w+):\1(:\*)?$/Id' $SPELLFILE)
 
   # for error or correction containing any non letter character (space, apostrophe) search is full word and case insensitive
-  IGNORECASE_FIXSPECIALCHAR=$(echo "$REMAINS" | perl -ne "print if     /^(\w*[ '])*\w*:\w+(?(1)|[' ])/" | ${GP}sed -r 's/^(.*?):.*?(:\*)?$/(\\b|_)\1(\\b|_)/' )
-  REMAINS=$(                  echo "$REMAINS" | perl -ne "print if not /^(\w*[ '])*\w*:\w+(?(1)|[' ])/")
+  IGNORECASE_FIXSPECIALCHAR=$(echo "$REMAINS" | perl -ne " print if     /^(\w*[ '.…])*\w*:\w*(?(1)|[ '.…])/" | ${GP}sed -r 's/(^[.]+?):.*?(:[*%])?$/(\\b|_|^| )\1(\\b|_|$| )/g' | ${GP}sed -r 's/\./\\./g' | ${GP}sed -r 's/^(\w.*?):.*?(:[*%])?$/(\\b|_)\1(\\b|_)/' )
+  REMAINS=$(                  echo "$REMAINS" | perl -ne " print if not /^(\w*[ '.…])*\w*:\w*(?(1)|[ '.…])/")
 
   # This will try to look for misspelling within larger words.
   # Condition is hard to explain in words.
-  # You can test it here: https://regex101.com/r/7kznVA/12
+  # You can test it here: https://regex101.com/r/7kznVA/14
   # adding :* in spelling.dat willextra words that should not be checked in longer words ca
   # remove those in spelling.dat ending with :*
   # following can be checked in longer words case insensitively
   IGNORECASE_INWORD=$(echo "$REMAINS" | perl -ne 'print if     /^(\w)(\w)(\w)\w*(\w)(\w)(\w):(?:(?!\2\3\w|\w\1\2).)\w*?(?:(?!\5\6\w|\w\4\5)\w\w\w)$/' | cut -d: -f1 )
   REMAINS=$(          echo "$REMAINS" | perl -ne 'print if not /^(\w)(\w)(\w)\w*(\w)(\w)(\w):(?:(?!\2\3\w|\w\1\2).)\w*?(?:(?!\5\6\w|\w\4\5)\w\w\w)$/' | cut -d: -f1 )
   # Trying with the rest as whole words case insensitively
-  IGNORECASE_WHOLEWORD=$(echo "$REMAINS" | ${GP}sed -r 's/^/(\\b|_)/; s/$/(\\b|_)/' )
+  IGNORECASE_WHOLEWORD=$(echo "$REMAINS" | ${GP}sed -r 's/^(.+)$/(\\b|_)\1(\\b|_)/')
   # or in camel case, case sensitively for word of at least 4 chars
   MATCHCASE_INWORD=$(echo "$REMAINS" | ${GP}sed -r '/^.{,3}$/d' | ${GP}sed -r 's/^(\w)(.*)/(\\b|_)(\l\1\L\2_|\u\1\U\2_|\u\1\L\2\U[_A-Z0-9])|\L[_a-z0-9]\u\1\L\2(\\b|\U[_A-Z0-9])|\L[_a-z0-9]\u\1\U\2\L(\\b|[_a-z0-9])/' )
 
@@ -126,14 +130,26 @@ for I in $(seq -f '%02g' 0  $(($SPLIT-1)) ) ; do
   IGNORECASE_WHOLEWORD=$(     echo "$IGNORECASE_WHOLEWORD"      | ${GP}sed -r '/^\s*$/d' | tr '\n' '\|' | ${GP}sed -r 's/\|$//')
   MATCHCASE_INWORD=$(         echo "$MATCHCASE_INWORD"          | ${GP}sed -r '/^\s*$/d' | tr '\n' '\|' | ${GP}sed -r 's/\|$//')
 
-  IGNORECASE=$(echo "(${IGNORECASE_FIXSPECIALCHAR}|${IGNORECASE_INWORD}|${IGNORECASE_WHOLEWORD})" |${GP}sed -r 's/\(\|/(/' |${GP}sed -r 's/\|\|/|/g' |${GP}sed -r 's/\|\)/)/')'(?!.*'"${SPELLOKRX}"')'
-  CASEMATCH=$(echo "(${CASEMATCH_FIXCASE}|${MATCHCASE_INWORD})" |${GP}sed -r 's/\(\|/(/' |${GP}sed -r 's/\|\|/|/g' |${GP}sed -r 's/\|\)/)/')'(?!.*'"${SPELLOKRX}"')'
+  IGNORECASE=$(echo "(${IGNORECASE_FIXSPECIALCHAR}|${IGNORECASE_INWORD}|${IGNORECASE_WHOLEWORD})" | ${GP}sed -r 's/\(\|/(/' | ${GP}sed -r 's/\|\|/|/g' | ${GP}sed -r 's/\|\)/)/' | ${GP}sed -r 's/^\(\)$//')
+  CASEMATCH=$(echo "(${CASEMATCH_FIXCASE}|${MATCHCASE_INWORD})" | ${GP}sed -r 's/\(\|/(/' |${GP}sed -r 's/\|\|/|/g' | ${GP}sed -r 's/\|\)/)/' | ${GP}sed -r 's/^\(\)$//')
+
+  RUN_IGNORECASE=OFF
+  RUN_CASEMATCH=OFF
+
+  if [[ ! -z "${IGNORECASE}" ]]; then
+    RUN_IGNORECASE=ON
+  fi
+  if [[ ! -z "${CASEMATCH}"  ]]; then
+    RUN_CASEMATCH=ON
+  fi
+
+  IGNORECASE=$IGNORECASE'(?!.*'"${SPELLOKRX}"')'
+  CASEMATCH=$CASEMATCH'(?!.*'"${SPELLOKRX}"')'
 
   FILE=$INPUTFILES  # init with input files (if ag is run with single file, file path is not written in output)
 
   while read -u 3 -r LINE; do
     echo -e "$LINE"
-    ERRORFOUND=YES
     NOCOLOR=$(echo "$LINE" | ${GP}sed -r 's/\x1B\[([0-9]{1,2}(;[0-9]{1,2})?)?[m|K]//g')
     if [[ "$NOCOLOR" =~ ^[[:alnum:]][[:alnum:]\/\._-]+$ ]]; then
       FILE=$NOCOLOR
@@ -147,16 +163,22 @@ for I in $(seq -f '%02g' 0  $(($SPLIT-1)) ) ; do
       NUMBER=$(echo "$NOCOLOR" | cut -d: -f1)
       ERRORLINE=$(echo "$NOCOLOR" | cut -d: -f2)
       ERROR=$(echo "$LINE" | ${GP}sed -r 's/^.*?\x1B\[30;43m(.*?)\x1B\[0m.*?$/\1/')
-      PREVCHAR=$(echo "$LINE" | cut -d: -f2 | ${GP}sed -r 's/^(.*?)\x1B\[30;43m.*?\x1B\[0m.*?$/\1/' | ${GP}sed -r 's/\x1B\[([0-9]{1,2}(;[0-9]{1,2})?)?[m|K]//g' | tail -c 2)
-      NEXTCHAR=$(echo "$LINE" | cut -d: -f2 | ${GP}sed -r 's/^.*?\x1B\[30;43m.*?\x1B\[0m(.*?)$/\1/' | ${GP}sed -r 's/\x1B\[([0-9]{1,2}(;[0-9]{1,2})?)?[m|K]//g' | head -c 1)
+      PREVCHAR=$(echo "$LINE" | cut -d: -f2- | ${GP}sed -r 's/^(.*?)\x1B\[30;43m.*?\x1B\[0m.*?$/\1/' | ${GP}sed -r 's/\x1B\[([0-9]{1,2}(;[0-9]{1,2})?)?[m|K]//g' | tail -c 2)
+      NEXTCHAR=$(echo "$LINE" | cut -d: -f2- | ${GP}sed -r 's/^.*?\x1B\[30;43m.*?\x1B\[0m(.*?)$/\1/' | ${GP}sed -r 's/\x1B\[([0-9]{1,2}(;[0-9]{1,2})?)?[m|K]//g' | head -c 1)
+
+      if [[ "$DEBUG" =~ YES ]]; then
+        echo "prev char: $PREVCHAR"
+        echo "next char: $NEXTCHAR"
+      fi
 
       ERRORNOCOLOR=$(echo "$ERROR" | ${GP}sed -r 's/\x1B\[([0-9]{1,2}(;[0-9]{1,2})?)?[m|K]//g')
+
       if [[ "$ERRORNOCOLOR" =~ ^[[:digit:]]+: ]]; then
         echo "*** error: could not find error in $LINE" >&2
       else
         # if the error is not in IGNORECASE_INWORD, then it matched previous and next character (needs to remove them)
         # also make error small case and escape special chars: () |
-        ERRORSMALLCASE=$(echo ${ERROR,,} |${GP}sed -r 's/\(/\\(/g' |${GP}sed -r 's/\)/\\)/g' |${GP}sed -r 's/\|/\\|/g')
+        ERRORSMALLCASE=$(echo ${ERRORNOCOLOR,,} |${GP}sed -r 's/\(/\\(/g' | ${GP}sed -r 's/\)/\\)/g' | ${GP}sed -r 's/\|/\\|/g' )
         if [[ ! "${ERRORSMALLCASE}" =~ $IGNORECASE_INWORD ]]; then
          if [[ -n $(ag --nonumbers --case-sensitive "^${ERRORSMALLCASE:1:-1}${ERRORSMALLCASE: -1}?:" scripts/spell_check/spelling.dat) ]]; then
            PREVCHAR=${ERROR::1}
@@ -171,13 +193,31 @@ for I in $(seq -f '%02g' 0  $(($SPLIT-1)) ) ; do
            ERROR=${ERROR::-1}
          fi
         fi
+        ERRORSMALLCASE=$(${GP}sed -r 's/\./\\./g' <<< $ERRORSMALLCASE)
+
         # get correction from spelling.dat
         CORRECTION=$(ag --nonumbers --case-sensitive "^${ERRORSMALLCASE}:" ${DIR}/spelling.dat | cut -d: -f2)
+        # exclude script files
+        if [[ "$(ag --nonumbers --case-sensitive "^${ERRORSMALLCASE}:" ${DIR}/spelling.dat | cut -d: -f3)" =~ "%" ]]; then
+          if [[ "$FILE" =~ $EXCLUDE_SCRIPT_LIST ]]; then
+            echo "skipping script file for $(${GP}sed -r 's/\\//g' <<< $ERRORSMALLCASE)"
+            continue
+          fi
+        fi
 
         if [[ -z "$CORRECTION" ]]; then
-          CORRECTION=$(perl -e "use strict; use warnings; while(<>) { chop; my(\$a,\$b) = split /:/; \$a = qr(\$a); if( my @matches = '${ERRORSMALLCASE}' =~ /^\$a\$/i ) { print sprintf(\$b, @matches); last; }}" ${DIR}/spelling.dat)
+          CORRECTION=$(perl -e "use strict; use warnings; while(<>) { chop; my(\$a,\$b) = split /:/; \$a = qr(\$a); if( my @matches = '${ERRORSMALLCASE}' =~ /^\$a\$/i ) { print sprintf(\$b, @matches); last; }}" ${DIR}/spelling.dat )
+          # exclude script files
+          if [[ "$(ag --nonumbers --case-sensitive ":${CORRECTION}" ${DIR}/spelling.dat | cut -d: -f3)" =~ "%" ]]; then
+            if [[ "$FILE" =~ $EXCLUDE_SCRIPT_LIST ]]; then
+              echo "skipping script file for $(${GP}sed -r 's/\\//g' <<< $ERRORSMALLCASE)"
+              continue
+            fi
+          fi
         fi
 
+        ERRORFOUND=YES
+
         if [[ -z "$CORRECTION" ]]; then
           echo "could not find correction for $ERROR" >&2
         else
@@ -202,14 +242,10 @@ for I in $(seq -f '%02g' 0  $(($SPLIT-1)) ) ; do
               SPELLOKSTR='//#spellok'
               if [[ "$FILE" =~ \.(txt|html|htm|dox)$ ]]; then
                 SPELLOKSTR='<!--#spellok-->'
-              fi
-              if [[ "$FILE" =~ \.(h|cpp|sip)$ ]]; then
-                if [[ "$ERRORLINE" =~ ^\s*(\/*\|\/\/) ]]; then
+              elif [[ "$FILE" =~ \.(h|cpp|sip)$ ]] && [[ "$ERRORLINE" =~ ^\s*(\/*\|\/\/) ]]; then
                   # line is already commented
                   SPELLOKSTR='#spellok'
-                fi
-              fi
-              if [[ "$FILE" =~ \.(py)$ ]]; then
+              elif [[ "$FILE" =~ \.(py|pl|sh|cmake(\.in)?)$ ]]; then
                 SPELLOKSTR='#spellok'
               fi
               SPELLOKSTR_ESC=$(echo "$SPELLOKSTR" | ${GP}sed -r 's/\//\\\//g')
@@ -226,25 +262,34 @@ for I in $(seq -f '%02g' 0  $(($SPLIT-1)) ) ; do
               echo -e "  o) skip all \x1B[4mo\x1B[0mccurences and continue"
               echo -e "  e) \x1B[4me\x1B[0mxit"
 
+              TOREPLACE=$(${GP}sed -r 's/([.\[/\]])/\\\1/g' <<< "${PREVCHAR}${ERROR}${NEXTCHAR}")
+              PREVCHAR=$(${GP}sed -r 's/\//\\\//g' <<< "${PREVCHAR}")
+              NEXTCHAR=$(${GP}sed -r 's/\//\\\//g' <<< "${NEXTCHAR}")
+
+              if [[ "$DEBUG" =~ YES ]]; then
+                echo "__${PREVCHAR}__${ERROR}__${NEXTCHAR}__"
+                echo "${NUMBER}s/$TOREPLACE/${PREVCHAR}$CORRECTIONCASE${NEXTCHAR}/g"
+              fi
+
               while read -n 1 n; do
                 echo ""
                 case $n in
                     r)
                       echo -e "replacing \x1B[33m$ERROR\x1B[0m by \x1B[33m$CORRECTIONCASE\x1B[0m in \x1B[33m$FILE\x1B[0m at line \x1B[33m$NUMBER\x1B[0m"
-                      ${GP}sed -i "${NUMBER}s/${PREVCHAR}${ERROR}${NEXTCHAR}/${PREVCHAR}$CORRECTIONCASE${NEXTCHAR}/g" $FILE
+                      ${GP}sed -i "${NUMBER}s/$TOREPLACE/${PREVCHAR}$CORRECTIONCASE${NEXTCHAR}/g" $FILE
                       break
                       ;;
                     f)
                       GLOBREP_CURRENTFILE+=(["$ERROR"]=1)
                       echo -e "replacing \x1B[33m$ERROR\x1B[0m by \x1B[33m$CORRECTIONCASE\x1B[0m in \x1B[33m$FILE\x1B[0m"
-                      ${GP}sed -i -r "/${SPELLOKRX}/! s/${PREVCHAR}${ERROR}${NEXTCHAR}/${PREVCHAR}$CORRECTIONCASE${NEXTCHAR}/g" $FILE
+                      ${GP}sed -i -r "/${SPELLOKRX}/! s/$TOREPLACE/${PREVCHAR}$CORRECTIONCASE${NEXTCHAR}/g" $FILE
                       break
                       ;;
                     a)
                       GLOBREP_CURRENTFILE+=(["$ERROR"]=1)
                       GLOBREP_ALLFILES+=(["$ERROR"]=1)
                       echo -e "replace \x1B[33m$ERROR\x1B[0m by \x1B[33m$CORRECTIONCASE\x1B[0m in \x1B[33m$FILE\x1B[0m"
-                      ${GP}sed -i -r "/${SPELLOKRX}/! s/${PREVCHAR}${ERROR}${NEXTCHAR}/${PREVCHAR}$CORRECTIONCASE${NEXTCHAR}/g" $FILE
+                      ${GP}sed -i -r "/${SPELLOKRX}/! s/$TOREPLACE/${PREVCHAR}$CORRECTIONCASE${NEXTCHAR}/g" $FILE
                       break
                       ;;
                     p)
@@ -258,7 +303,7 @@ for I in $(seq -f '%02g' 0  $(($SPLIT-1)) ) ; do
                       MATCHCASE="$ERROR:$CORRECTION"
                       CORRECTIONCASE=$(echo "$MATCHCASE" | ${GP}sed -r 's/([A-Z]+):(.*)/\1:\U\2/; s/([A-Z][a-z]+):([a-z])/\1:\U\2\L/' | cut -d: -f2)
                       echo -e "replacing \x1B[33m$ERROR\x1B[0m by \x1B[33m$CORRECTIONCASE\x1B[0m in \x1B[33m$FILE\x1B[0m at line \x1B[33m$NUMBER\x1B[0m"
-                      ${GP}sed -i "${NUMBER}s/${PREVCHAR}${ERROR}${NEXTCHAR}/${PREVCHAR}$CORRECTIONCASE${NEXTCHAR}/g" $FILE
+                      ${GP}sed -i "${NUMBER}s/$TOREPLACE/${PREVCHAR}$CORRECTIONCASE${NEXTCHAR}/g" $FILE
                       break
                       ;;
                     c)
@@ -283,11 +328,12 @@ for I in $(seq -f '%02g' 0  $(($SPLIT-1)) ) ; do
       fi
     fi
   done 3< <(
-    unbuffer ag --noaffinity --all-text --nopager --color-match "30;43" --numbers --nomultiline --ignore-case    -p $AGIGNORE "${IGNORECASE}" $INPUTFILES
-    unbuffer ag --noaffinity --all-text --nopager --color-match "30;43" --numbers --nomultiline --case-sensitive -p $AGIGNORE "${CASEMATCH}"  $INPUTFILES
+    [[ "$RUN_IGNORECASE" == "ON" ]] && unbuffer ag --noaffinity --all-text --nopager --color-match "30;43" --numbers --nomultiline --ignore-case    -p $AGIGNORE "${IGNORECASE}" $INPUTFILES ;
+    [[ "$RUN_CASEMATCH" == "ON" ]] &&  unbuffer ag --noaffinity --all-text --nopager --color-match "30;43" --numbers --nomultiline --case-sensitive -p $AGIGNORE "${CASEMATCH}"  $INPUTFILES
   )
 
   rm -f $SPELLFILE
+
 done
 
 ( [[ "$INTERACTIVE" =~ YES ]] || [[ "$TRAVIS" =~ true ]] ) && echo

diff --git a/scripts/spell_check/spelling.dat b/scripts/spell_check/spelling.dat
@@ -1,3 +1,6 @@
+# append :* to avoid in word check_spelling
+# append :% to exclude script files
+#...:…:%
 abailable:available
 abandonded:abandoned
 abandonned:abandoned