Java按字节数截取字符串,一个中文长度为2
碰到可能会截到汉字的情况,当然不能截取出乱码来,也就是不能把一个汉字截成两半。如"我ABC汉字d"这个字符串(按GBK编码,一个汉字占2个字节),截取5个字节的时候,应该是"我ABC",而截取8个字节的时候,应该是"我ABC汉",而不应该是"我ABC汉?",其中"?"为半个汉字,可理解为向前截取
/**
 * Returns the longest prefix of {@code str} whose GBK encoding occupies at most
 * {@code subSLength} bytes, never cutting a character in half (the cut moves
 * backwards to the previous whole character).
 *
 * <p>Example (GBK, a Chinese character takes 2 bytes): for {@code "我ABC汉字d"}
 * a budget of 5 bytes yields {@code "我ABC"} and a budget of 8 bytes yields
 * {@code "我ABC汉"}.
 *
 * @param str        the string to truncate; {@code null} yields {@code ""}
 * @param subSLength the maximum number of GBK bytes to keep; zero or negative
 *                   values yield {@code ""}
 * @return the truncated prefix, never {@code null}
 * @throws UnsupportedEncodingException if the runtime does not provide the GBK charset
 */
public static String subStr(String str, int subSLength)
        throws UnsupportedEncodingException {
    // A non-positive budget cannot hold any character; returning early also keeps
    // a negative value from reaching substring(), which would throw.
    if (str == null || subSLength <= 0) {
        return "";
    }
    // Start with as many characters as the byte budget. Multi-byte characters make
    // this overshoot, so the loop below drops trailing characters until it fits.
    int charCount = Math.min(str.length(), subSLength);
    String subStr = str.substring(0, charCount);
    while (subStr.getBytes("GBK").length > subSLength) {
        charCount--;
        subStr = str.substring(0, charCount);
    }
    return subStr;
}
备注:将字符编码GBK改为UTF-8,则每个中文长度按3个字节计算
以下方法是向后截取字符串
public static String subStr_1(String str, int start, int end)
throws UnsupportedEncodingException{
if (str == null) return null;
String chinese = "\[\\u0391-\\uFFE5\]";
byte\[\] b = str.getBytes("UTF-8");
String temp = new String(b, start, end);
String last = getLastStr(temp);
while(!last.matches(chinese)){
temp = new String(b, start, ++end);
last = getLastStr(temp);
}
return new String(b, start, end);
}
public static String getByteStr(String str, int start, int end) throws UnsupportedEncodingException{
byte\[\] b = str.getBytes("UTF-8");
return new String(b, start, end);
}