File name
Commit message
Commit date
2024-11-05
File name
Commit message
Commit date
2024-11-05
File name
Commit message
Commit date
File name
Commit message
Commit date
File name
Commit message
Commit date
File name
Commit message
Commit date
package itn.let.hangulparser;
import java.util.ArrayList;
import java.util.List;
/**
* HangulParser separates Hangul syllables into their basic consonants and vowels (jaso) using Unicode arithmetic, and assembles jaso back into syllables
* @see HangulParserException
*
* ref : Hangul Syllables http://www.unicode.org/charts/PDF/UAC00.pdf
*/
public class HangulParser {
    // First syllable '가' : 0xAC00 (44032); last syllable '힣' : 0xD7A3 (55203).
    // 19 initials * 21 medials * 28 finals = 11172 precomposed syllables.
    private static final int FIRST_HANGUL = 44032;
    private static final int LAST_HANGUL = 55203;

    // 19 initial consonants
    private static final char[] CHOSUNG_LIST = {
        'ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅃ', 'ㅅ',
        'ㅆ', 'ㅇ', 'ㅈ', 'ㅉ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ'
    };

    private static final int JUNGSUNG_COUNT = 21;

    // 21 vowels
    private static final char[] JUNGSUNG_LIST = {
        'ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ', 'ㅘ',
        'ㅙ', 'ㅚ', 'ㅛ', 'ㅜ', 'ㅝ', 'ㅞ', 'ㅟ', 'ㅠ', 'ㅡ', 'ㅢ',
        'ㅣ'
    };

    private static final int JONGSUNG_COUNT = 28;

    // 27 final consonants, preceded by one filler entry (index 0) meaning "no final consonant"
    private static final char[] JONGSUNG_LIST = {
        ' ', 'ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ',
        'ㄻ', 'ㄼ', 'ㄽ', 'ㄾ', 'ㄿ', 'ㅀ', 'ㅁ', 'ㅂ', 'ㅄ', 'ㅅ',
        'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ'
    };

    // Returned by assemble() for an empty list; kept verbatim because callers may compare against it.
    private static final String EMPTY_JASO_MESSAGE = "자소가 없습니다.";

    /**
     * Splits one character into its jaso.
     *
     * <p>A precomposed Hangul syllable ('가'..'힣') yields its initial consonant, its vowel and,
     * when present, its final consonant. A space yields an empty list. Every other character
     * (standalone jamo, Latin letters, digits, punctuation, ...) is returned unchanged as a
     * single-element list.
     *
     * @param hangul the character to split
     * @return a new mutable list holding zero to three one-character strings
     * @throws Exception never thrown by this implementation; declared for backward compatibility
     */
    public static List<String> disassemble(char hangul) throws Exception {
        List<String> jasoList = new ArrayList<>();
        appendJaso(hangul, jasoList);
        return jasoList;
    }

    /**
     * Splits every character of a string into jaso, in order.
     *
     * <p>Spaces are dropped, exactly as in {@link #disassemble(char)}.
     *
     * @param hangul the text to split; {@code null} is treated as empty
     * @return a new mutable list with the jaso of all characters concatenated
     * @throws Exception never thrown by this implementation; declared for backward compatibility
     */
    public static List<String> disassemble(String hangul) throws Exception {
        List<String> jasoList = new ArrayList<>();
        if (hangul == null) {
            return jasoList;
        }
        for (int i = 0, length = hangul.length(); i < length; i++) {
            appendJaso(hangul.charAt(i), jasoList);
        }
        return jasoList;
    }

    /**
     * Combines a jaso sequence back into text.
     *
     * <p>An initial consonant immediately followed by a vowel forms a syllable. The token after
     * the vowel is taken as that syllable's final consonant when it is a valid final consonant
     * and does not itself start the next syllable (that is, it is not an initial consonant
     * followed by a vowel). Any token that cannot take part in a syllable is copied to the
     * output unchanged, so lone jamo, Latin letters, digits and punctuation survive a
     * disassemble/assemble round trip.
     *
     * @param jasoList the jaso to combine, one token per element
     * @return the assembled text, or the fixed message {@code "자소가 없습니다."} when the list is
     *         {@code null} or empty (legacy behaviour)
     * @throws Exception never thrown by this implementation; declared for backward compatibility
     */
    public static String assemble(List<String> jasoList) throws Exception {
        if (jasoList == null || jasoList.isEmpty()) {
            return EMPTY_JASO_MESSAGE;
        }
        final int size = jasoList.size();
        final StringBuilder result = new StringBuilder(size);
        int index = 0;
        while (index < size) {
            final int chosungIndex = indexOf(CHOSUNG_LIST, jasoList.get(index));
            final int jungsungIndex =
                (index + 1 < size) ? indexOf(JUNGSUNG_LIST, jasoList.get(index + 1)) : -1;
            if (chosungIndex < 0 || jungsungIndex < 0) {
                // Not the start of a syllable: keep the token as it is instead of inventing one.
                final String token = jasoList.get(index);
                if (token != null) {
                    result.append(token);
                }
                index++;
                continue;
            }
            int unicode = FIRST_HANGUL
                + (chosungIndex * JUNGSUNG_COUNT + jungsungIndex) * JONGSUNG_COUNT;
            int consumed = 2;
            if (index + 2 < size) {
                final int jongsungIndex = indexOf(JONGSUNG_LIST, jasoList.get(index + 2));
                // Index 0 is the "no final consonant" filler, so only indices > 0 are real finals.
                if (jongsungIndex > 0 && !startsSyllable(jasoList, index + 2)) {
                    unicode += jongsungIndex;
                    consumed = 3;
                }
            }
            result.append((char) unicode);
            index += consumed;
        }
        return result.toString();
    }

    /** Appends the jaso of one character to {@code jasoList}; spaces are skipped. */
    private static void appendJaso(char hangul, List<String> jasoList) {
        if (hangul == ' ') {
            return;
        }
        if (hangul < FIRST_HANGUL || hangul > LAST_HANGUL) {
            // Standalone jamo and non-Hangul characters are passed through untouched.
            jasoList.add(String.valueOf(hangul));
            return;
        }
        final int baseCode = hangul - FIRST_HANGUL;
        final int chosungIndex = baseCode / (JUNGSUNG_COUNT * JONGSUNG_COUNT);
        final int jungsungIndex = (baseCode / JONGSUNG_COUNT) % JUNGSUNG_COUNT;
        final int jongsungIndex = baseCode % JONGSUNG_COUNT;
        jasoList.add(Character.toString(CHOSUNG_LIST[chosungIndex]));
        jasoList.add(Character.toString(JUNGSUNG_LIST[jungsungIndex]));
        if (jongsungIndex > 0) {
            jasoList.add(Character.toString(JONGSUNG_LIST[jongsungIndex]));
        }
    }

    /** Returns true when the token at {@code index} is an initial consonant followed by a vowel. */
    private static boolean startsSyllable(List<String> jasoList, int index) {
        return index + 1 < jasoList.size()
            && indexOf(CHOSUNG_LIST, jasoList.get(index)) >= 0
            && indexOf(JUNGSUNG_LIST, jasoList.get(index + 1)) >= 0;
    }

    /** Returns the position of a one-character token in {@code table}, or -1 if it is not there. */
    private static int indexOf(char[] table, String jaso) {
        if (jaso == null || jaso.length() != 1) {
            return -1;
        }
        final char target = jaso.charAt(0);
        for (int i = 0; i < table.length; i++) {
            if (table[i] == target) {
                return i;
            }
        }
        return -1;
    }
}