blob: 87cb0abb6e21689e18269429110eec9f3524ecd3 [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
import {
sanitizeHtml,
isProbablyHTML,
sanitizeHtmlIfNeeded,
safeHtmlSpan,
removeHTMLTags,
isJsonString,
getParagraphContents,
extractTextFromHTML,
} from './html';
// sanitizeHtml: script markup must not survive sanitization.
describe('sanitizeHtml', () => {
  it('should sanitize the HTML string', () => {
    // Only the absence of the substring "script" is asserted; the exact
    // sanitized output is intentionally left unspecified.
    expect(sanitizeHtml('<script>alert("XSS")</script>')).not.toContain(
      'script',
    );
  });
});
// isProbablyHTML: real markup is detected, while text that merely contains
// angle brackets is not.
describe('isProbablyHTML', () => {
  it('should return true if the text contains HTML tags', () => {
    expect(isProbablyHTML('<div>Some HTML content</div>')).toBe(true);
  });

  it('should return false if the text does not contain HTML tags', () => {
    // Plain prose first, then comparison operators that only resemble tag
    // delimiters.
    ['Just a plain text', 'a <= 10 and b > 10'].forEach(sample => {
      expect(isProbablyHTML(sample)).toBe(false);
    });
  });

  it('should return false for strings with angle brackets that are not HTML', () => {
    const bracketedNonHtml = [
      // Test case from issue #25561
      '<abcdef:12345>',
      // Other similar cases
      '<foo:bar>',
      '<123>',
      '<test@example.com>',
      '<custom-element>',
      // Mathematical expressions
      'if x < 5 and y > 10',
      'price < $100',
    ];

    bracketedNonHtml.forEach(sample => {
      expect(isProbablyHTML(sample)).toBe(false);
    });
  });
});
// sanitizeHtmlIfNeeded: both inputs used here are expected to come back
// unchanged.
describe('sanitizeHtmlIfNeeded', () => {
  it('should sanitize the HTML string if it contains HTML tags', () => {
    // This markup is expected to round-trip through sanitization untouched.
    const markup = '<div>Some <b>HTML</b> content</div>';
    expect(sanitizeHtmlIfNeeded(markup)).toEqual(markup);
  });

  it('should return the string as is if it does not contain HTML tags', () => {
    const text = 'Just a plain text';
    expect(sanitizeHtmlIfNeeded(text)).toEqual(text);
  });
});
describe('safeHtmlSpan', () => {
it('should return a safe HTML span when the input is HTML', () => {
const htmlString = '<div>Some <b>HTML</b> content</div>';
const safeSpan = safeHtmlSpan(htmlString);
expect(safeSpan).toEqual(
<span
className="safe-html-wrapper"
dangerouslySetInnerHTML={{ __html: htmlString }}
/>,
);
});
it('should return the input string as is when it is not HTML', () => {
const plainText = 'Just a plain text';
const result = safeHtmlSpan(plainText);
expect(result).toEqual(plainText);
});
});
// removeHTMLTags: each case pairs an input string with the exact text that is
// expected to remain once tags are removed.
describe('removeHTMLTags', () => {
  const cases = [
    {
      title: 'should remove HTML tags from the string',
      input: '<p>Hello, <strong>World!</strong></p>',
      expected: 'Hello, World!',
    },
    {
      title: 'should return the same string when no HTML tags are present',
      input: 'This is a plain text.',
      expected: 'This is a plain text.',
    },
    {
      title:
        'should remove nested HTML tags and return combined text content',
      input: '<div><h1>Title</h1><p>Content</p></div>',
      expected: 'TitleContent',
    },
    {
      title: 'should handle self-closing tags and return an empty string',
      input: '<img src="image.png" alt="Image">',
      expected: '',
    },
    {
      title:
        'should handle malformed HTML tags and remove only well-formed tags',
      input: '<div><h1>Unclosed tag',
      expected: 'Unclosed tag',
    },
  ];

  // One test is registered per case, in the order listed above.
  cases.forEach(({ title, input, expected }) => {
    it(title, () => {
      expect(removeHTMLTags(input)).toBe(expected);
    });
  });
});
// isJsonString: each case pairs a candidate string with the boolean the helper
// is expected to return for it.
describe('isJsonString', () => {
  const cases = [
    {
      title: 'valid JSON object',
      input: '{"name": "John", "age": 30, "city": "New York"}',
      expected: true,
    },
    {
      title: 'valid JSON array',
      input: '[1, 2, 3, 4, 5]',
      expected: true,
    },
    {
      title: 'valid JSON string',
      input: '"Hello, world!"',
      expected: true,
    },
    {
      // Same object as the first case, minus the closing brace.
      title: 'invalid JSON with syntax error',
      input: '{"name": "John", "age": 30, "city": "New York"',
      expected: false,
    },
    {
      title: 'empty string',
      input: '',
      expected: false,
    },
    {
      title: 'non-JSON string',
      input: '<p>Hello, <strong>World!</strong></p>',
      expected: false,
    },
    {
      title: 'non-JSON formatted number',
      input: '12345abc',
      expected: false,
    },
  ];

  // One test is registered per case, in the order listed above.
  cases.forEach(({ title, input, expected }) => {
    it(title, () => {
      expect(isJsonString(input)).toBe(expected);
    });
  });
});
// getParagraphContents: expected to yield an object keyed p1, p2, ... for the
// <p> elements in the input, or null when there is nothing to report.
describe('getParagraphContents', () => {
  it('should return an object with keys for each paragraph tag', () => {
    const paragraphs = getParagraphContents(
      '<div><p>First paragraph.</p><p>Second paragraph.</p></div>',
    );

    expect(paragraphs).toEqual({
      p1: 'First paragraph.',
      p2: 'Second paragraph.',
    });
  });

  it('should return null if the string is not HTML', () => {
    expect(getParagraphContents('Just a plain text string.')).toBeNull();
  });

  it('should return null if there are no <p> tags in the HTML string', () => {
    // Valid markup, but with no paragraph element in it.
    expect(
      getParagraphContents('<div><span>No paragraph here.</span></div>'),
    ).toBeNull();
  });

  it('should return an object with empty string for empty <p> tag', () => {
    expect(getParagraphContents('<div><p></p></div>')).toEqual({ p1: '' });
  });

  it('should handle HTML strings with nested <p> tags correctly', () => {
    // The paragraph wraps an inline <span>; the expected value is the
    // paragraph text with that inner tag removed.
    const paragraphs = getParagraphContents(
      '<div><p>First paragraph <span>with nested</span> content.</p></div>',
    );

    expect(paragraphs).toEqual({
      p1: 'First paragraph with nested content.',
    });
  });
});
// extractTextFromHTML: markup is reduced to its text, while non-HTML strings
// and non-string values are expected to come back unchanged.
describe('extractTextFromHTML', () => {
  // Registers one exact-match test per { title, input, expected } entry.
  const registerExactCases = cases => {
    cases.forEach(({ title, input, expected }) => {
      it(title, () => {
        expect(extractTextFromHTML(input)).toBe(expected);
      });
    });
  };

  registerExactCases([
    {
      title: 'should extract text from HTML div tags',
      input: '<div>Hello World</div>',
      expected: 'Hello World',
    },
    {
      title: 'should extract text from nested HTML tags',
      input: '<div><p>Hello <strong>World</strong></p></div>',
      expected: 'Hello World',
    },
    {
      title: 'should extract text from multiple HTML elements',
      input: '<h1>Title</h1><p>Content</p><span>Footer</span>',
      expected: 'TitleContentFooter',
    },
    {
      title: 'should return original string when input is not HTML',
      input: 'Just plain text',
      expected: 'Just plain text',
    },
  ]);

  it('should return original value when input is not a string', () => {
    // A number, null and a boolean must each be handed back as-is.
    [12345, null, true].forEach(value => {
      expect(extractTextFromHTML(value)).toBe(value);
    });
  });

  registerExactCases([
    {
      title: 'should handle empty HTML tags',
      input: '<div></div>',
      expected: '',
    },
    {
      title: 'should handle HTML with only whitespace',
      input: '<div> </div>',
      expected: ' ',
    },
    {
      title: 'should extract text from HTML with attributes',
      input: '<div class="container" id="main">Hello World</div>',
      expected: 'Hello World',
    },
    {
      title: 'should handle self-closing tags',
      input: '<img src="image.jpg" alt="Image"><br><p>Text after</p>',
      expected: 'Text after',
    },
  ]);

  it('should handle complex HTML structure', () => {
    // A full document, one tag per line; the empty first and last entries
    // produce the leading and trailing newline of the fixture.
    const documentMarkup = [
      '',
      '<html>',
      '<head><title>Page Title</title></head>',
      '<body>',
      '<header><h1>Main Title</h1></header>',
      '<main>',
      '<p>First paragraph with <em>emphasis</em>.</p>',
      '<ul>',
      '<li>Item 1</li>',
      '<li>Item 2</li>',
      '</ul>',
      '</main>',
      '</body>',
      '</html>',
      '',
    ].join('\n');

    const extracted = extractTextFromHTML(documentMarkup);

    // Only the presence of each fragment is checked, not the surrounding
    // whitespace.
    [
      'Page Title',
      'Main Title',
      'First paragraph with emphasis.',
      'Item 1',
      'Item 2',
    ].forEach(fragment => {
      expect(extracted).toContain(fragment);
    });
  });

  it('should not extract text from strings that look like HTML but are not', () => {
    // Angle-bracketed tokens and math comparisons must come back unchanged.
    ['<abcdef:12345>', 'x < 5 and y > 10'].forEach(lookalike => {
      expect(extractTextFromHTML(lookalike)).toBe(lookalike);
    });
  });
});