[HIVEMALL-316] Improve error message for duplicate entries error in Tokenizer user dictionary

## What changes were proposed in this pull request?

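Improve the error message raised when a Tokenizer user dictionary fails to parse (for example, because it contains duplicate entries). The new message lists the conditions to check: UTF-8 file encoding, no duplicate entries, the 32MB maximum dictionary size, and the read/connection timeouts.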

## What type of PR is it?

Improvement

## What is the Jira issue?

https://issues.apache.org/jira/browse/HIVEMALL-316

## Checklist

(Please remove this section if not needed; check `x` for YES, blank for NO)

- [x] Did you apply source code formatter, i.e., `./bin/format_code.sh`, for your commit?
- [ ] Did you run system tests on Hive (or Spark)?

Author: Makoto Yui <myui@apache.org>

Closes #245 from myui/HIVEMALL-316.
diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiNEologdUDF.java b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiNEologdUDF.java
index b41d4dc..39ea743 100644
--- a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiNEologdUDF.java
+++ b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiNEologdUDF.java
@@ -383,8 +383,13 @@
             return UserDictionary.open(reader); // return null if empty
         } catch (Throwable e) {
             throw new UDFArgumentException(
-                "Failed to parse the file in CSV format (UTF-8 encoding is expected): "
-                        + userDictURL + '\n' + ExceptionUtils.prettyPrintStackTrace(e));
+                "Failed to parse the dictionary CSV file: " + userDictURL + '\n'
+                + "Please ensure that \n"
+                + "  1) file encoding is UTF-8, \n"
+                + "  2) no duplicate entry.\n"
+                + "  3) the maximum dictionary size is limited to 32MB (SHOULD be compressed using gzip with .gz suffix)\n"
+                + "  4) read timeout is set to 60 sec and connection must be established in 10 sec.\n"
+                        +  ExceptionUtils.prettyPrintStackTrace(e));
         }
     }
 
diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
index 07059b2..e84488e 100644
--- a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
+++ b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
@@ -383,8 +383,13 @@
             return UserDictionary.open(reader); // return null if empty
         } catch (Throwable e) {
             throw new UDFArgumentException(
-                "Failed to parse the file in CSV format (UTF-8 encoding is expected): "
-                        + userDictURL + '\n' + ExceptionUtils.prettyPrintStackTrace(e));
+                "Failed to parse the dictionary CSV file: " + userDictURL + '\n'
+                + "Please ensure that \n"
+                + "  1) file encoding is UTF-8, \n"
+                + "  2) no duplicate entry.\n"
+                + "  3) the maximum dictionary size is limited to 32MB (SHOULD be compressed using gzip with .gz suffix)\n"
+                + "  4) read timeout is set to 60 sec and connection must be established in 10 sec.\n"
+                        +  ExceptionUtils.prettyPrintStackTrace(e));
         }
     }
 
diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/TokenizeKoUDF.java b/nlp/src/main/java/hivemall/nlp/tokenizer/TokenizeKoUDF.java
index fb61633..1486b7f 100644
--- a/nlp/src/main/java/hivemall/nlp/tokenizer/TokenizeKoUDF.java
+++ b/nlp/src/main/java/hivemall/nlp/tokenizer/TokenizeKoUDF.java
@@ -267,7 +267,6 @@
         return stopTags;
     }
 
-
     @Nullable
     private static UserDictionary userDictionary(@Nullable final String[] userDictArray)
             throws UDFArgumentException {
@@ -375,8 +374,13 @@
             return UserDictionary.open(reader); // return null if empty
         } catch (Throwable e) {
             throw new UDFArgumentException(
-                "Failed to parse the file in CSV format (UTF-8 encoding is expected): "
-                        + userDictURL + '\n' + ExceptionUtils.prettyPrintStackTrace(e));
+                "Failed to parse the dictionary CSV file: " + userDictURL + '\n'
+                + "Please ensure that \n"
+                + "  1) file encoding is UTF-8, \n"
+                + "  2) no duplicate entry.\n"
+                + "  3) the maximum dictionary size is limited to 32MB (SHOULD be compressed using gzip with .gz suffix)\n"
+                + "  4) read timeout is set to 60 sec and connection must be established in 10 sec.\n"
+                        +  ExceptionUtils.prettyPrintStackTrace(e));
         }
     }