2012-08-28 1 views
0

RSS 파일을 구문 분석한 후 얻은 문자열이 있습니다. J2ME에서 HTML 태그와 특수 문자 없이 HTML 텍스트 추출

String htmlString =

<p><img border="1" align="left" width="200" vspace="2" hspace="2" height="133" alt="Prime Minister Manmohan Singh will leave for Iran on August 28, 2012 to attend the Non-Aligned Movement summit, which will be preceded by crucial bilateral talks with Iran&rsquo;s supreme leader Ayotollah Ali Khamenei and Iranian President Mahmoud Ahmadinejad." src="/tmdbuserfiles/manmohan ahmadi(3).jpg" />Prime Minister summit, which will be preceded by crucial bilateral talks with Iran&rsquo;s supreme leader place at a time when the U.S. is pushing India to reduce engagement with Iran and implement sanctions imposed by some countries over its controversial nuclear programme.<br /> 
    <br /> 
    &nbsp;</p> 

위의 htmlString에서 HTML 태그와 HTML 특수 문자를 제거한 텍스트를 다음과 같이 LWUIT 폼(Form)에 표시해야 합니다:

Prime Minister ManmohanSingh will leave for Iran on August 28, 2012 to attend the Non-Aligned Movement summit, which will 
be preceded by crucial bilateral talks with Iran supreme leader Ayotollah Ali Khamenei and Iranian etc...........? 

답변

1

또한 다음과 같이 UTF-8 인코딩을 사용하여 HttpConnection 입력 스트림을 여는 것이 도움이 됩니다.

// Wrap the raw byte stream in a Reader that decodes it as UTF-8 instead of
// relying on the platform default encoding.
// NOTE(review): `in` is presumably the InputStream obtained from the
// HttpConnection -- it is not declared in this snippet; confirm at the call site.
String encoding = "UTF-8"; 
Reader reader = new InputStreamReader(in, encoding); 

깨끗하게 정리된 텍스트를 얻으려면 다음 StringUtils 메서드 모음을 사용하십시오:

/**
 * Removes HTML tags from the given string and decodes the entities known to
 * {@link #decodeEntities(String)}.
 *
 * <p>Closing paragraph tags ({@code </p>}) and line breaks ({@code <br>},
 * {@code <br/>}, {@code <br />}) are converted to {@code "\r\n"} before the
 * remaining tags are stripped. When the input contained at least one
 * {@code '<'}, the result is trimmed; otherwise only entities are decoded.
 *
 * <p>A {@code '<'} that is never closed by a {@code '>'} is not a tag; it and
 * everything after it are kept as literal text.
 *
 * @param text input that may contain HTML tags (e.g. {@code <b>cat</b>});
 *             may be {@code null}
 * @return the text without HTML tags (e.g. {@code cat}); {@code null} if
 *         {@code text} is {@code null}; the unmodified input if an unexpected
 *         error occurs while processing
 */
public static String removeHtml(String text) {
    if (text == null) {
        return text;
    }
    try {
        if (text.indexOf("<") == -1) {
            // No markup at all: only entities need decoding.
            return decodeEntities(text);
        }

        // Turn block/line-break markup into real line breaks first.
        String htmlText = StringUtils.replace(text, "</p>", "\r\n");
        htmlText = StringUtils.replace(htmlText, "<br/>", "\r\n");
        htmlText = StringUtils.replace(htmlText, "<br />", "\r\n");
        htmlText = StringUtils.replace(htmlText, "<br>", "\r\n");

        StringBuffer plainText = new StringBuffer(htmlText.length());
        int pos = 0;
        // Search only after the replacements above; they shift every index.
        int tagStart = htmlText.indexOf("<", pos);
        while (tagStart >= 0) {
            plainText.append(htmlText.substring(pos, tagStart));
            int tagEnd = htmlText.indexOf(">", tagStart);
            if (tagEnd == -1) {
                // Unterminated '<': keep the remainder as literal text
                // instead of looping forever on the same position.
                pos = tagStart;
                break;
            }
            pos = tagEnd + 1;
            tagStart = htmlText.indexOf("<", pos);
        }
        // Text following the last tag must not be lost.
        plainText.append(htmlText.substring(pos));

        return decodeEntities(plainText.toString().trim());
    } catch (Exception e) {
        // Best effort: report the problem and hand back the input untouched.
        System.err.println("Error while removing HTML: " + e.toString());
        return text;
    }
}

/**
 * Decodes a small, fixed set of HTML character entities into plain characters.
 *
 * <p>Handled entities: {@code &lt; &gt; &nbsp; &auml; &ouml; &quot; &lsquo;
 * &rsquo; &ldquo; &rdquo; &#xd; &amp;}, plus the non-standard
 * {@code &lquot;} / {@code &rquot;} spellings kept for backward compatibility.
 * Typographic quotes are mapped to their plain ASCII counterparts. Any other
 * entity is left untouched.
 *
 * <p>{@code &amp;} is decoded last so that an escaped ampersand followed by
 * an entity name (e.g. {@code &amp;quot;}) yields the literal text
 * {@code &quot;} rather than being decoded twice.
 *
 * @param html text that may contain HTML entities; may be {@code null}
 * @return the text with the known entities decoded, or {@code null} when
 *         {@code html} is {@code null}
 */
public static String decodeEntities(String html) {
    String result = StringUtils.replace(html, "&lt;", "<");
    result = StringUtils.replace(result, "&gt;", ">");
    result = StringUtils.replace(result, "&nbsp;", " ");
    result = StringUtils.replace(result, "&auml;", "\u00e4");
    result = StringUtils.replace(result, "&ouml;", "\u00f6");
    result = StringUtils.replace(result, "&quot;", "\"");
    result = StringUtils.replace(result, "&lsquo;", "'");
    result = StringUtils.replace(result, "&rsquo;", "'");
    result = StringUtils.replace(result, "&ldquo;", "\"");
    result = StringUtils.replace(result, "&rdquo;", "\"");
    // Non-standard names accepted by earlier versions of this method.
    result = StringUtils.replace(result, "&lquot;", "'");
    result = StringUtils.replace(result, "&rquot;", "'");
    result = StringUtils.replace(result, "&#xd;", "\r");
    // Must stay last: see the double-decoding note in the method comment.
    result = StringUtils.replace(result, "&amp;", "&");
    return result;
}

/**
 * Replaces every occurrence of one string inside another.
 *
 * <p>Matches are found left to right and do not overlap; text inserted as a
 * replacement is never searched again.
 *
 * @param s string to alter; returned as-is when {@code null}
 * @param f string to look for; when {@code null} or empty, {@code s} is
 *          returned unchanged (an empty search string would otherwise match
 *          at every position and never terminate)
 * @param r string to replace it with, or {@code null} to just remove it
 * @return {@code s} with all occurrences of {@code f} replaced by {@code r};
 *         the same {@code s} instance when nothing matched
 */
public static String replace(String s, String f, String r) {
    if (s == null || f == null || f.length() == 0) {
        return s;
    }
    int match = s.indexOf(f);
    if (match == -1) {
        return s;
    }
    String replacement = (r == null) ? "" : r;

    // Build the result in one pass instead of re-copying the string per match.
    StringBuffer out = new StringBuffer(s.length());
    int from = 0;
    while (match != -1) {
        out.append(s.substring(from, match));
        out.append(replacement);
        from = match + f.length();
        match = s.indexOf(f, from);
    }
    out.append(s.substring(from));
    return out.toString();
}

/**
 * Re-reads the bytes of a string through a UTF-8 reader and rebuilds a string
 * from the decoded characters.
 *
 * <p>The input is turned into bytes with {@code getBytes()} (no charset
 * argument), decoded character by character as UTF-8, each character is
 * narrowed to a single byte, and the collected bytes are converted back with
 * {@code toString()} (again no charset argument).
 *
 * <p>NOTE(review): presumably meant to repair text whose UTF-8 bytes were
 * decoded with the wrong charset; the result for non-ASCII input depends on
 * the platform default encoding -- confirm before relying on it.
 *
 * @param str the string to process; must not be {@code null}
 * @return the rebuilt string, or {@code str} itself if reading fails
 */
public static String cleanEncodedString(String str) {
    final String encoding = "UTF-8";
    String resultStr = str;

    InputStream in = new ByteArrayInputStream(str.getBytes());
    try {
        InputStreamReader isr = new InputStreamReader(in, encoding);
        ByteArrayOutputStream buf = new ByteArrayOutputStream();

        // Copy decoded characters one at a time, keeping only the low byte of each.
        for (int ch = isr.read(); ch != -1; ch = isr.read()) {
            buf.write((byte) ch);
        }
        resultStr = buf.toString();
    } catch (Exception uee) {
        uee.printStackTrace();
    }
    return resultStr;
}
1
// Extract the value of the first alt="..." attribute from htmlString and
// decode the few entities expected inside it. resultantString stays empty
// when no complete alt="..." attribute is present.
int startIndex = htmlString.indexOf("alt=\"");
// 5 == "alt=\"".length(): the attribute value starts right after the opening quote.
int endIndex = (startIndex == -1) ? -1 : htmlString.indexOf("\"", startIndex + 5);
String resultantString = "";
if (endIndex != -1) {
    resultantString = htmlString.substring(startIndex + 5, endIndex);
    resultantString = replaceAll(resultantString, "&quot;", "\"");
    resultantString = replaceAll(resultantString, "&rsquo;", "\u2019");
    // Decode &amp; last so that "&amp;rsquo;" becomes the literal text
    // "&rsquo;" instead of being decoded twice.
    resultantString = replaceAll(resultantString, "&amp;", "&");
}




/**
 * Replaces every occurrence of {@code pattern} in {@code source} with
 * {@code replacement}. The pattern is matched literally (it is not a regular
 * expression); matches are found left to right and do not overlap.
 *
 * @param source      text to search; {@code null} yields an empty string
 * @param pattern     literal text to look for; when {@code null} or empty,
 *                    {@code source} is returned unchanged (an empty pattern
 *                    would otherwise match forever at the same position)
 * @param replacement text to insert for each match; {@code null} is treated
 *                    as an empty string, i.e. matches are removed
 * @return the text with all matches replaced
 */
private String replaceAll(String source, String pattern, String replacement) {
    if (source == null) {
        return "";
    }
    if (pattern == null || pattern.length() == 0) {
        return source;
    }
    String insert = (replacement == null) ? "" : replacement;

    StringBuffer sb = new StringBuffer();
    int patIndex = 0;
    int index;
    while ((index = source.indexOf(pattern, patIndex)) != -1) {
        sb.append(source.substring(patIndex, index));
        sb.append(insert);
        patIndex = index + pattern.length();
    }
    // Remainder after the last match (or the whole source if none matched).
    sb.append(source.substring(patIndex));
    return sb.toString();
}