mirror of
				https://github.com/nkanaev/yarr.git
				synced 2025-10-31 06:53:30 +00:00 
			
		
		
		
	trim and collapse whitespace in ExtractText
This commit is contained in:
		| @@ -2,11 +2,14 @@ package htmlutil | ||||
|  | ||||
| import ( | ||||
| 	"bytes" | ||||
| 	"regexp" | ||||
| 	"strings" | ||||
|  | ||||
| 	"golang.org/x/net/html" | ||||
| ) | ||||
|  | ||||
| var whitespaceRegex = regexp.MustCompile(`[\s]+`) | ||||
|  | ||||
| func HTML(node *html.Node) string { | ||||
| 	writer := strings.Builder{} | ||||
| 	html.Render(&writer, node) | ||||
| @@ -53,5 +56,8 @@ func ExtractText(content string) string { | ||||
| 			buffer.WriteString(html.UnescapeString(string(tokenizer.Text()))) | ||||
| 		} | ||||
| 	} | ||||
| 	return buffer.String() | ||||
| 	text := buffer.String() | ||||
| 	text = strings.TrimSpace(text) | ||||
| 	text = whitespaceRegex.ReplaceAllLiteralString(text, " ") | ||||
| 	return text | ||||
| } | ||||
|   | ||||
							
								
								
									
										26
									
								
								src/content/htmlutil/utils_test.go
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										26
									
								
								src/content/htmlutil/utils_test.go
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,26 @@ | ||||
| package htmlutil | ||||
|  | ||||
| import "testing" | ||||
|  | ||||
| func TestExtractText(t *testing.T) { | ||||
| 	testcases := [][2]string { | ||||
| 		{"hello", "<div>hello</div>"}, | ||||
| 		{"hello world", "<div>hello</div> world"}, | ||||
| 		{"helloworld", "<div>hello</div>world"}, | ||||
| 		{"hello world", "hello <div>world</div>"}, | ||||
| 		{"helloworld", "hello<div>world</div>"}, | ||||
| 		{"hello world!", "hello <div>world</div>!"}, | ||||
| 		{"hello world !", "hello <div>   world\r\n </div>!"}, | ||||
| 	} | ||||
| 	for _, testcase := range testcases { | ||||
| 		want := testcase[0] | ||||
| 		base := testcase[1] | ||||
| 		have := ExtractText(base) | ||||
| 		if want != have { | ||||
| 			t.Logf("base: %#v\n", base) | ||||
| 			t.Logf("want: %#v\n", want) | ||||
| 			t.Logf("have: %#v\n", have) | ||||
| 			t.Fail() | ||||
| 		} | ||||
| 	} | ||||
| } | ||||
		Reference in New Issue
	
	Block a user