mirror of
https://gitee.com/dromara/hutool.git
synced 2025-05-02 20:02:49 +08:00
add test and fix comment
This commit is contained in:
parent
6a9feb0576
commit
c6f2754221
@ -7,11 +7,15 @@ import cn.hutool.core.util.StrUtil;
|
||||
/**
|
||||
* HTML工具类
|
||||
*
|
||||
* <p>
|
||||
* 比如我们在使用爬虫爬取HTML页面后,需要对返回页面的HTML内容做一定处理,<br>
|
||||
* 比如去掉指定标签(例如广告栏等)、去除JS、去掉样式等等,这些操作都可以使用此工具类完成。
|
||||
*
|
||||
* @author xiaoleilu
|
||||
*
|
||||
*/
|
||||
public class HtmlUtil {
|
||||
|
||||
|
||||
public static final String NBSP = StrUtil.HTML_NBSP;
|
||||
public static final String AMP = StrUtil.HTML_AMP;
|
||||
public static final String QUOTE = StrUtil.HTML_QUOTE;
|
||||
@ -36,12 +40,12 @@ public class HtmlUtil {
|
||||
TEXT['<'] = LT.toCharArray(); // 小于号
|
||||
TEXT['>'] = GT.toCharArray(); // 大于号
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 转义文本中的HTML字符为安全的字符,以下字符被转义:
|
||||
* <ul>
|
||||
* <li>' 替换为 &amp;#039; (&amp;apos; doesn't work in HTML4)</li>
|
||||
* <li>" 替换为 &amp;quot;</li>
|
||||
* <li>' 替换为 &amp;#039; (&amp;apos; doesn't work in HTML4)</li>
|
||||
* <li>" 替换为 &amp;quot;</li>
|
||||
* <li>&amp; 替换为 &amp;amp;</li>
|
||||
* <li>&lt; 替换为 &amp;lt;</li>
|
||||
* <li>&gt; 替换为 &amp;gt;</li>
|
||||
@ -64,14 +68,14 @@ public class HtmlUtil {
|
||||
if (StrUtil.isBlank(htmlStr)) {
|
||||
return htmlStr;
|
||||
}
|
||||
|
||||
|
||||
return EscapeUtil.unescapeHtml4(htmlStr);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------- encode text
|
||||
|
||||
/**
|
||||
* 清除所有HTML标签
|
||||
* 清除所有HTML标签,但是不删除标签内的内容
|
||||
*
|
||||
* @param content 文本
|
||||
* @return 清除标签后的文本
|
||||
@ -135,7 +139,7 @@ public class HtmlUtil {
|
||||
}
|
||||
|
||||
/**
|
||||
* 去除HTML标签中的属性
|
||||
* 去除HTML标签中的属性,如果多个标签有相同属性,都去除
|
||||
*
|
||||
* @param content 文本
|
||||
* @param attrs 属性名(不区分大小写)
|
||||
@ -144,6 +148,7 @@ public class HtmlUtil {
|
||||
public static String removeHtmlAttr(String content, String... attrs) {
|
||||
String regex = null;
|
||||
for (String attr : attrs) {
|
||||
// (?i)表示忽略大小写
|
||||
regex = StrUtil.format("(?i)\\s*{}=([\"']).*?\\1", attr);
|
||||
content = content.replaceAll(regex, StrUtil.EMPTY);
|
||||
}
|
||||
|
@ -46,6 +46,39 @@ public class HtmlUtilTest {
|
||||
Assert.assertEquals("pre", result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void cleanHtmlTagTest() {
|
||||
//非闭合标签
|
||||
String str = "pre<img src=\"xxx/dfdsfds/test.jpg\">";
|
||||
String result = HtmlUtil.cleanHtmlTag(str);
|
||||
Assert.assertEquals("pre", result);
|
||||
|
||||
//闭合标签
|
||||
str = "pre<img>";
|
||||
result = HtmlUtil.cleanHtmlTag(str);
|
||||
Assert.assertEquals("pre", result);
|
||||
|
||||
//闭合标签
|
||||
str = "pre<img src=\"xxx/dfdsfds/test.jpg\" />";
|
||||
result = HtmlUtil.cleanHtmlTag(str);
|
||||
Assert.assertEquals("pre", result);
|
||||
|
||||
//闭合标签
|
||||
str = "pre<img />";
|
||||
result = HtmlUtil.cleanHtmlTag(str);
|
||||
Assert.assertEquals("pre", result);
|
||||
|
||||
//包含内容标签
|
||||
str = "pre<div class=\"test_div\">dfdsfdsfdsf</div>";
|
||||
result = HtmlUtil.cleanHtmlTag(str);
|
||||
Assert.assertEquals("predfdsfdsfdsf", result);
|
||||
|
||||
//带换行
|
||||
str = "pre<div class=\"test_div\">\r\n\t\tdfdsfdsfdsf\r\n</div><div class=\"test_div\">BBBB</div>";
|
||||
result = HtmlUtil.cleanHtmlTag(str);
|
||||
Assert.assertEquals("pre\r\n\t\tdfdsfdsfdsf\r\nBBBB", result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void unwrapHtmlTagTest() {
|
||||
//非闭合标签
|
||||
@ -83,6 +116,7 @@ public class HtmlUtilTest {
|
||||
public void escapeTest() {
|
||||
String html = "<html><body>123'123'</body></html>";
|
||||
String escape = HtmlUtil.escape(html);
|
||||
Assert.assertEquals("<html><body>123'123'</body></html>", escape);
|
||||
String restoreEscaped = HtmlUtil.unescape(escape);
|
||||
Assert.assertEquals(html, restoreEscaped);
|
||||
}
|
||||
@ -93,4 +127,18 @@ public class HtmlUtilTest {
|
||||
String filter = HtmlUtil.filter(html);
|
||||
Assert.assertEquals("", filter);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void removeHtmlAttrTest() {
|
||||
String html = "<div class=\"test_div\"></div><span class=\"test_div\"></span>";
|
||||
String result = HtmlUtil.removeHtmlAttr(html, "class");
|
||||
Assert.assertEquals("<div></div><span></span>", result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void removeAllHtmlAttrTest() {
|
||||
String html = "<div class=\"test_div\" width=\"120\"></div>";
|
||||
String result = HtmlUtil.removeAllHtmlAttr(html, "div");
|
||||
Assert.assertEquals("<div></div>", result);
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user