From 39e37aab077c31b75c80dd1553fb9acdfc10b1fc Mon Sep 17 00:00:00 2001
From: Florian Uhlig <f.uhlig@gsi.de>
Date: Wed, 12 Mar 2025 11:34:33 +0100
Subject: [PATCH] Check for non ASCII characters in CI

Recently it was found that some files contain non ASCII characters. The added
CI test checks for files which contain non ASCII characters. The test fails if
non ASCII characters are found in a commit.
---
 .gitlab-ci.yml                        | 18 +++++++++++
 scripts/check-non-ascii-characters.sh | 43 +++++++++++++++++++++++++++
 2 files changed, 61 insertions(+)
 create mode 100755 scripts/check-non-ascii-characters.sh

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index e5ed3d1aca..7785cf72d4 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -181,6 +181,24 @@ FileEndCheck:
     - git fetch upstream
     - scripts/check-file-ending.sh upstream
 
+NonASCIICharacterCheck:
+  image: alpine
+  stage: checkFormat
+  tags:
+    - docker
+  only:
+    variables:
+      - '$CI_MERGE_REQUEST_PROJECT_PATH == "computing/cbmroot" && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME == "master"'
+      - '$CI_MERGE_REQUEST_PROJECT_PATH == "computing/cbmroot" && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /^DC_.*$/'
+    refs:
+      - merge_requests
+  script:
+    # The upstream repository is connected and fetched by hand as the remote
+    # "upstream"; no other way was found to have it available for comparison.
+    - apk update && apk add git bash file pcre-tools
+    - scripts/connect_upstream_repo.sh $CI_MERGE_REQUEST_PROJECT_URL
+    - git fetch upstream
+    - scripts/check-non-ascii-characters.sh upstream
 
 FileLicenceCheck:
   stage: checkFormat
diff --git a/scripts/check-non-ascii-characters.sh b/scripts/check-non-ascii-characters.sh
new file mode 100755
index 0000000000..d4778a9256
--- /dev/null
+++ b/scripts/check-non-ascii-characters.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+# Copyright (C) 2025 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt
+# SPDX-License-Identifier: GPL-3.0-only
+# First committed by Florian Uhlig
+#
+# Exit 1 if a file that differs from <upstream>/master holds non ASCII text.
+# The remote name comes from $1, else $UPSTREAM, else the cbmroot remote URL.
+
+if [[ $# -eq 1 ]]; then
+  UPSTREAM=$1
+elif [[ -z "$UPSTREAM" ]]; then
+  UPSTREAM=$(git remote -v | grep 'git.cbm.gsi.de[:/]computing/cbmroot' | cut -f1 | uniq)
+  if [[ -z "$UPSTREAM" ]]; then
+    echo "Error: Name of upstream repository not provided and not found by automatic means"
+    echo 'Please provide it by checking your remotes with "git remote -v" and exporting UPSTREAM'
+    echo "or passing as an argument"
+    exit -1
+  fi
+fi
+echo "Upstream name is :" "$UPSTREAM"
+
+BASE_COMMIT=$UPSTREAM/master
+okay=true  # explicit start value; never inherit "okay" from the environment
+# NUL separated names keep paths with blanks or special characters intact.
+while IFS= read -r -d '' file; do
+  [[ -f "$file" ]] || continue  # not in the working tree: nothing to scan
+  # -b keeps the file name out of the match; any 8-bit encoding counts.
+  if file -b "$file" | grep -Eq 'UTF-8|ISO-8859|extended-ASCII'; then
+    echo " "
+    echo "File $file contains non ASCII characters"
+    pcregrep --color='auto' -n "[\x80-\xFF]" "$file"
+    echo " "
+    okay=false
+  fi
+done < <(git diff --name-only -z "$BASE_COMMIT")
+
+if [[ "$okay" = "false" ]]; then
+  echo ""
+  echo "There are files which contain non ASCII characters"
+  echo "Test failed"
+  exit 1
+fi
+exit 0
-- 
GitLab