add test and fix comment

This commit is contained in:
Looly 2019-09-11 16:02:54 +08:00
parent 6a9feb0576
commit c6f2754221
2 changed files with 60 additions and 7 deletions

View File

@ -7,11 +7,15 @@ import cn.hutool.core.util.StrUtil;
/**
* HTML工具类
*
* <p>
* 比如我们在使用爬虫爬取HTML页面后，需要对返回页面的HTML内容做一定处理，<br>
* 比如去掉指定标签（例如广告栏等）、去除JS、去掉样式等等，这些操作都可以使用此工具类完成。
*
* @author xiaoleilu
*
*/
public class HtmlUtil {
public static final String NBSP = StrUtil.HTML_NBSP;
public static final String AMP = StrUtil.HTML_AMP;
public static final String QUOTE = StrUtil.HTML_QUOTE;
@ -36,12 +40,12 @@ public class HtmlUtil {
TEXT['<'] = LT.toCharArray(); // 小于号
TEXT['>'] = GT.toCharArray(); // 大于号
}
/**
* 转义文本中的HTML字符为安全的字符，以下字符被转义：
* <ul>
* <li>' 替换为 &amp;#039; (&amp;apos; doesn't work in HTML4)</li>
* <li>" 替换为 &amp;quot;</li>
* <li>&amp; 替换为 &amp;amp;</li>
* <li>&lt; 替换为 &amp;lt;</li>
* <li>&gt; 替换为 &amp;gt;</li>
@ -64,14 +68,14 @@ public class HtmlUtil {
if (StrUtil.isBlank(htmlStr)) {
return htmlStr;
}
return EscapeUtil.unescapeHtml4(htmlStr);
}
// ---------------------------------------------------------------- encode text
/**
* 清除所有HTML标签，但是不删除标签内的内容
*
* @param content 文本
* @return 清除标签后的文本
@ -135,7 +139,7 @@ public class HtmlUtil {
}
/**
* 去除HTML标签中的属性，如果多个标签有相同属性，都去除
*
* @param content 文本
* @param attrs 属性名（不区分大小写）
@ -144,6 +148,7 @@ public class HtmlUtil {
public static String removeHtmlAttr(String content, String... attrs) {
String regex = null;
for (String attr : attrs) {
// (?i)表示忽略大小写
regex = StrUtil.format("(?i)\\s*{}=([\"']).*?\\1", attr);
content = content.replaceAll(regex, StrUtil.EMPTY);
}

View File

@ -46,6 +46,39 @@ public class HtmlUtilTest {
Assert.assertEquals("pre", result);
}
/**
 * Checks {@code HtmlUtil.cleanHtmlTag} against a table of inputs: the tags
 * themselves are expected to disappear while any text that sat between an
 * opening and a closing tag (including line breaks and tabs) is kept.
 */
@Test
public void cleanHtmlTagTest() {
	// Each row is { input, expected output }; rows are checked in order.
	final String[][] cases = {
		// tag with an attribute and no trailing slash
		{"pre<img src=\"xxx/dfdsfds/test.jpg\">", "pre"},
		// bare tag with neither attributes nor a trailing slash
		{"pre<img>", "pre"},
		// self-closing tag with an attribute
		{"pre<img src=\"xxx/dfdsfds/test.jpg\" />", "pre"},
		// self-closing tag without attributes
		{"pre<img />", "pre"},
		// tag pair wrapping text: the text must survive
		{"pre<div class=\"test_div\">dfdsfdsfdsf</div>", "predfdsfdsfdsf"},
		// two tag pairs, the first wrapping text with CR/LF and tabs
		{"pre<div class=\"test_div\">\r\n\t\tdfdsfdsfdsf\r\n</div><div class=\"test_div\">BBBB</div>",
			"pre\r\n\t\tdfdsfdsfdsf\r\nBBBB"},
	};

	for (final String[] row : cases) {
		final String input = row[0];
		final String expected = row[1];
		Assert.assertEquals(expected, HtmlUtil.cleanHtmlTag(input));
	}
}
@Test
public void unwrapHtmlTagTest() {
//非闭合标签
@ -83,6 +116,7 @@ public class HtmlUtilTest {
public void escapeTest() {
	// Round-trip check: escape a small document, then unescape it again.
	final String original = "<html><body>123'123'</body></html>";

	final String escaped = HtmlUtil.escape(original);
	// Angle brackets and single quotes are expected to come back as entities.
	Assert.assertEquals("&lt;html&gt;&lt;body&gt;123&#039;123&#039;&lt;/body&gt;&lt;/html&gt;", escaped);

	// Unescaping the escaped text is expected to reproduce the input exactly.
	final String roundTripped = HtmlUtil.unescape(escaped);
	Assert.assertEquals(original, roundTripped);
}
@ -93,4 +127,18 @@ public class HtmlUtilTest {
String filter = HtmlUtil.filter(html);
Assert.assertEquals("", filter);
}
/**
 * Checks that {@code HtmlUtil.removeHtmlAttr} strips the named attribute from
 * every tag that carries it, not only from the first match.
 */
@Test
public void removeHtmlAttrTest() {
	// Both the div and the span carry the same class attribute.
	final String input = "<div class=\"test_div\"></div><span class=\"test_div\"></span>";
	final String stripped = HtmlUtil.removeHtmlAttr(input, "class");
	Assert.assertEquals("<div></div><span></span>", stripped);
}
/**
 * Checks that {@code HtmlUtil.removeAllHtmlAttr} leaves a bare tag behind when
 * the named tag carries several attributes.
 */
@Test
public void removeAllHtmlAttrTest() {
	// A div with two attributes; both are expected to be gone afterwards.
	final String input = "<div class=\"test_div\" width=\"120\"></div>";
	final String stripped = HtmlUtil.removeAllHtmlAttr(input, "div");
	Assert.assertEquals("<div></div>", stripped);
}
}