[HIVEMALL-316] Improve the error message for duplicate entries in the Tokenizer user dictionary
## What changes were proposed in this pull request?
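Improve the error message raised when a Tokenizer user dictionary fails to load (for example, because it contains duplicate entries). The message in `KuromojiUDF`, `KuromojiNEologdUDF`, and `TokenizeKoUDF` now lists the conditions to check: UTF-8 encoding, no duplicate entries, the 32MB size limit, and the connection/read timeouts.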
## What type of PR is it?
Improvement
## What is the Jira issue?
https://issues.apache.org/jira/browse/HIVEMALL-316
## Checklist
(Please remove this section if not needed; check `x` for YES, blank for NO)
- [x] Did you apply source code formatter, i.e., `./bin/format_code.sh`, for your commit?
- [ ] Did you run system tests on Hive (or Spark)?
Author: Makoto Yui <myui@apache.org>
Closes #245 from myui/HIVEMALL-316.
diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiNEologdUDF.java b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiNEologdUDF.java
index b41d4dc..39ea743 100644
--- a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiNEologdUDF.java
+++ b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiNEologdUDF.java
@@ -383,8 +383,13 @@
return UserDictionary.open(reader); // return null if empty
} catch (Throwable e) {
throw new UDFArgumentException(
- "Failed to parse the file in CSV format (UTF-8 encoding is expected): "
- + userDictURL + '\n' + ExceptionUtils.prettyPrintStackTrace(e));
+ "Failed to parse the dictionary CSV file: " + userDictURL + '\n'
+ + "Please ensure that \n"
+ + " 1) file encoding is UTF-8, \n"
+ + " 2) no duplicate entry.\"\n"
+ + " 3) the maximum dictionary size is limited to 32MB (SHOULD be compressed using gzip with .gz suffix)\n"
+ + " 4) read timeout is set to 60 sec and connection must be established in 10 sec.\n"
+ + ExceptionUtils.prettyPrintStackTrace(e));
}
}
diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
index 07059b2..e84488e 100644
--- a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
+++ b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
@@ -383,8 +383,13 @@
return UserDictionary.open(reader); // return null if empty
} catch (Throwable e) {
throw new UDFArgumentException(
- "Failed to parse the file in CSV format (UTF-8 encoding is expected): "
- + userDictURL + '\n' + ExceptionUtils.prettyPrintStackTrace(e));
+ "Failed to parse the dictionary CSV file: " + userDictURL + '\n'
+ + "Please ensure that \n"
+ + " 1) file encoding is UTF-8, \n"
+ + " 2) no duplicate entry.\"\n"
+ + " 3) the maximum dictionary size is limited to 32MB (SHOULD be compressed using gzip with .gz suffix)\n"
+ + " 4) read timeout is set to 60 sec and connection must be established in 10 sec.\n"
+ + ExceptionUtils.prettyPrintStackTrace(e));
}
}
diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/TokenizeKoUDF.java b/nlp/src/main/java/hivemall/nlp/tokenizer/TokenizeKoUDF.java
index fb61633..1486b7f 100644
--- a/nlp/src/main/java/hivemall/nlp/tokenizer/TokenizeKoUDF.java
+++ b/nlp/src/main/java/hivemall/nlp/tokenizer/TokenizeKoUDF.java
@@ -267,7 +267,6 @@
return stopTags;
}
-
@Nullable
private static UserDictionary userDictionary(@Nullable final String[] userDictArray)
throws UDFArgumentException {
@@ -375,8 +374,13 @@
return UserDictionary.open(reader); // return null if empty
} catch (Throwable e) {
throw new UDFArgumentException(
- "Failed to parse the file in CSV format (UTF-8 encoding is expected): "
- + userDictURL + '\n' + ExceptionUtils.prettyPrintStackTrace(e));
+ "Failed to parse the dictionary CSV file: " + userDictURL + '\n'
+ + "Please ensure that \n"
+ + " 1) file encoding is UTF-8, \n"
+ + " 2) no duplicate entry.\"\n"
+ + " 3) the maximum dictionary size is limited to 32MB (SHOULD be compressed using gzip with .gz suffix)\n"
+ + " 4) read timeout is set to 60 sec and connection must be established in 10 sec.\n"
+ + ExceptionUtils.prettyPrintStackTrace(e));
}
}