superset-frontend/packages/superset-ui-core/src/utils/html.test.tsx - superset - Git at Google

 /**
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */
 import {
   sanitizeHtml,
   isProbablyHTML,
   sanitizeHtmlIfNeeded,
   safeHtmlSpan,
   removeHTMLTags,
   isJsonString,
   getParagraphContents,
   extractTextFromHTML,
 } from './html';

 describe('sanitizeHtml', () => {
   it('should sanitize the HTML string', () => {
     const htmlString = '<script>alert("XSS")</script>';
     const sanitizedString = sanitizeHtml(htmlString);
     expect(sanitizedString).not.toContain('script');
   });
 });

 describe('isProbablyHTML', () => {
   it('should return true if the text contains HTML tags', () => {
     const htmlText = '<div>Some HTML content</div>';
     const isHTML = isProbablyHTML(htmlText);
     expect(isHTML).toBe(true);
   });

   it('should return false if the text does not contain HTML tags', () => {
     const plainText = 'Just a plain text';
     const isHTML = isProbablyHTML(plainText);
     expect(isHTML).toBe(false);

     const trickyText = 'a <= 10 and b > 10';
     expect(isProbablyHTML(trickyText)).toBe(false);
   });

   it('should return false for strings with angle brackets that are not HTML', () => {
     // Test case from issue #25561
     expect(isProbablyHTML('<abcdef:12345>')).toBe(false);

     // Other similar cases
     expect(isProbablyHTML('<foo:bar>')).toBe(false);
     expect(isProbablyHTML('<123>')).toBe(false);
     expect(isProbablyHTML('<test@example.com>')).toBe(false);
     expect(isProbablyHTML('<custom-element>')).toBe(false);

     // Mathematical expressions
     expect(isProbablyHTML('if x < 5 and y > 10')).toBe(false);
     expect(isProbablyHTML('price < $100')).toBe(false);
   });
 });

 describe('sanitizeHtmlIfNeeded', () => {
   it('should sanitize the HTML string if it contains HTML tags', () => {
     const htmlString = '<div>Some <b>HTML</b> content</div>';
     const sanitizedString = sanitizeHtmlIfNeeded(htmlString);
     expect(sanitizedString).toEqual(htmlString);
   });

   it('should return the string as is if it does not contain HTML tags', () => {
     const plainText = 'Just a plain text';
     const sanitizedString = sanitizeHtmlIfNeeded(plainText);
     expect(sanitizedString).toEqual(plainText);
   });
 });

 describe('safeHtmlSpan', () => {
   it('should return a safe HTML span when the input is HTML', () => {
     const htmlString = '<div>Some <b>HTML</b> content</div>';
     const safeSpan = safeHtmlSpan(htmlString);
     expect(safeSpan).toEqual(
       <span
         className="safe-html-wrapper"
         dangerouslySetInnerHTML={{ __html: htmlString }}
       />,
     );
   });

   it('should return the input string as is when it is not HTML', () => {
     const plainText = 'Just a plain text';
     const result = safeHtmlSpan(plainText);
     expect(result).toEqual(plainText);
   });
 });

 describe('removeHTMLTags', () => {
   it('should remove HTML tags from the string', () => {
     const input = '<p>Hello, <strong>World!</strong></p>';
     const output = removeHTMLTags(input);
     expect(output).toBe('Hello, World!');
   });

   it('should return the same string when no HTML tags are present', () => {
     const input = 'This is a plain text.';
     const output = removeHTMLTags(input);
     expect(output).toBe('This is a plain text.');
   });

   it('should remove nested HTML tags and return combined text content', () => {
     const input = '<div><h1>Title</h1><p>Content</p></div>';
     const output = removeHTMLTags(input);
     expect(output).toBe('TitleContent');
   });

   it('should handle self-closing tags and return an empty string', () => {
     const input = '<img src="image.png" alt="Image">';
     const output = removeHTMLTags(input);
     expect(output).toBe('');
   });

   it('should handle malformed HTML tags and remove only well-formed tags', () => {
     const input = '<div><h1>Unclosed tag';
     const output = removeHTMLTags(input);
     expect(output).toBe('Unclosed tag');
   });
 });

 describe('isJsonString', () => {
   it('valid JSON object', () => {
     const jsonString = '{"name": "John", "age": 30, "city": "New York"}';
     expect(isJsonString(jsonString)).toBe(true);
   });

   it('valid JSON array', () => {
     const jsonString = '[1, 2, 3, 4, 5]';
     expect(isJsonString(jsonString)).toBe(true);
   });

   it('valid JSON string', () => {
     const jsonString = '"Hello, world!"';
     expect(isJsonString(jsonString)).toBe(true);
   });

   it('invalid JSON with syntax error', () => {
     const jsonString = '{"name": "John", "age": 30, "city": "New York"';
     expect(isJsonString(jsonString)).toBe(false);
   });

   it('empty string', () => {
     const jsonString = '';
     expect(isJsonString(jsonString)).toBe(false);
   });

   it('non-JSON string', () => {
     const jsonString = '<p>Hello, <strong>World!</strong></p>';
     expect(isJsonString(jsonString)).toBe(false);
   });

   it('non-JSON formatted number', () => {
     const jsonString = '12345abc';
     expect(isJsonString(jsonString)).toBe(false);
   });
 });

 describe('getParagraphContents', () => {
   it('should return an object with keys for each paragraph tag', () => {
     const htmlString =
       '<div><p>First paragraph.</p><p>Second paragraph.</p></div>';
     const result = getParagraphContents(htmlString);
     expect(result).toEqual({
       p1: 'First paragraph.',
       p2: 'Second paragraph.',
     });
   });

   it('should return null if the string is not HTML', () => {
     const nonHtmlString = 'Just a plain text string.';
     expect(getParagraphContents(nonHtmlString)).toBeNull();
   });

   it('should return null if there are no <p> tags in the HTML string', () => {
     const htmlStringWithoutP = '<div><span>No paragraph here.</span></div>';
     expect(getParagraphContents(htmlStringWithoutP)).toBeNull();
   });

   it('should return an object with empty string for empty <p> tag', () => {
     const htmlStringWithEmptyP = '<div><p></p></div>';
     const result = getParagraphContents(htmlStringWithEmptyP);
     expect(result).toEqual({ p1: '' });
   });

   it('should handle HTML strings with nested <p> tags correctly', () => {
     const htmlStringWithNestedP =
       '<div><p>First paragraph <span>with nested</span> content.</p></div>';
     const result = getParagraphContents(htmlStringWithNestedP);
     expect(result).toEqual({
       p1: 'First paragraph with nested content.',
     });
   });
 });

 describe('extractTextFromHTML', () => {
   it('should extract text from HTML div tags', () => {
     const htmlString = '<div>Hello World</div>';
     const result = extractTextFromHTML(htmlString);
     expect(result).toBe('Hello World');
   });

   it('should extract text from nested HTML tags', () => {
     const htmlString = '<div><p>Hello <strong>World</strong></p></div>';
     const result = extractTextFromHTML(htmlString);
     expect(result).toBe('Hello World');
   });

   it('should extract text from multiple HTML elements', () => {
     const htmlString = '<h1>Title</h1><p>Content</p><span>Footer</span>';
     const result = extractTextFromHTML(htmlString);
     expect(result).toBe('TitleContentFooter');
   });

   it('should return original string when input is not HTML', () => {
     const plainText = 'Just plain text';
     const result = extractTextFromHTML(plainText);
     expect(result).toBe('Just plain text');
   });

   it('should return original value when input is not a string', () => {
     const numberValue = 12345;
     const result = extractTextFromHTML(numberValue);
     expect(result).toBe(12345);

     const nullValue = null;
     const nullResult = extractTextFromHTML(nullValue);
     expect(nullResult).toBe(null);

     const booleanValue = true;
     const booleanResult = extractTextFromHTML(booleanValue);
     expect(booleanResult).toBe(true);
   });

   it('should handle empty HTML tags', () => {
     const htmlString = '<div></div>';
     const result = extractTextFromHTML(htmlString);
     expect(result).toBe('');
   });

   it('should handle HTML with only whitespace', () => {
     const htmlString = '<div>   </div>';
     const result = extractTextFromHTML(htmlString);
     expect(result).toBe('   ');
   });

   it('should extract text from HTML with attributes', () => {
     const htmlString = '<div class="container" id="main">Hello World</div>';
     const result = extractTextFromHTML(htmlString);
     expect(result).toBe('Hello World');
   });

   it('should handle self-closing tags', () => {
     const htmlString = '<img src="image.jpg" alt="Image"><br><p>Text after</p>';
     const result = extractTextFromHTML(htmlString);
     expect(result).toBe('Text after');
   });

   it('should handle complex HTML structure', () => {
     const htmlString = `
       <html>
         <head><title>Page Title</title></head>
         <body>
           <header><h1>Main Title</h1></header>
           <main>
             <p>First paragraph with <em>emphasis</em>.</p>
             <ul>
               <li>Item 1</li>
               <li>Item 2</li>
             </ul>
           </main>
         </body>
       </html>
     `;
     const result = extractTextFromHTML(htmlString);
     expect(result).toContain('Page Title');
     expect(result).toContain('Main Title');
     expect(result).toContain('First paragraph with emphasis.');
     expect(result).toContain('Item 1');
     expect(result).toContain('Item 2');
   });

   it('should not extract text from strings that look like HTML but are not', () => {
     const fakeHtmlString = '<abcdef:12345>';
     const result = extractTextFromHTML(fakeHtmlString);
     expect(result).toBe('<abcdef:12345>');

     const mathExpression = 'x < 5 and y > 10';
     const mathResult = extractTextFromHTML(mathExpression);
     expect(mathResult).toBe('x < 5 and y > 10');
   });
 });
	/**
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/
	import {
	sanitizeHtml,
	isProbablyHTML,
	sanitizeHtmlIfNeeded,
	safeHtmlSpan,
	removeHTMLTags,
	isJsonString,
	getParagraphContents,
	extractTextFromHTML,
	} from './html';

	describe('sanitizeHtml', () => {
	it('should sanitize the HTML string', () => {
	const htmlString = '<script>alert("XSS")</script>';
	const sanitizedString = sanitizeHtml(htmlString);
	expect(sanitizedString).not.toContain('script');
	});
	});

	describe('isProbablyHTML', () => {
	it('should return true if the text contains HTML tags', () => {
	const htmlText = '<div>Some HTML content</div>';
	const isHTML = isProbablyHTML(htmlText);
	expect(isHTML).toBe(true);
	});

	it('should return false if the text does not contain HTML tags', () => {
	const plainText = 'Just a plain text';
	const isHTML = isProbablyHTML(plainText);
	expect(isHTML).toBe(false);

	const trickyText = 'a <= 10 and b > 10';
	expect(isProbablyHTML(trickyText)).toBe(false);
	});

	it('should return false for strings with angle brackets that are not HTML', () => {
	// Test case from issue #25561
	expect(isProbablyHTML('<abcdef:12345>')).toBe(false);

	// Other similar cases
	expect(isProbablyHTML('<foo:bar>')).toBe(false);
	expect(isProbablyHTML('<123>')).toBe(false);
	expect(isProbablyHTML('<test@example.com>')).toBe(false);
	expect(isProbablyHTML('<custom-element>')).toBe(false);

	// Mathematical expressions
	expect(isProbablyHTML('if x < 5 and y > 10')).toBe(false);
	expect(isProbablyHTML('price < $100')).toBe(false);
	});
	});

	describe('sanitizeHtmlIfNeeded', () => {
	it('should sanitize the HTML string if it contains HTML tags', () => {
	const htmlString = '<div>Some <b>HTML</b> content</div>';
	const sanitizedString = sanitizeHtmlIfNeeded(htmlString);
	expect(sanitizedString).toEqual(htmlString);
	});

	it('should return the string as is if it does not contain HTML tags', () => {
	const plainText = 'Just a plain text';
	const sanitizedString = sanitizeHtmlIfNeeded(plainText);
	expect(sanitizedString).toEqual(plainText);
	});
	});

	describe('safeHtmlSpan', () => {
	it('should return a safe HTML span when the input is HTML', () => {
	const htmlString = '<div>Some <b>HTML</b> content</div>';
	const safeSpan = safeHtmlSpan(htmlString);
	expect(safeSpan).toEqual(
	<span
	className="safe-html-wrapper"
	dangerouslySetInnerHTML={{ __html: htmlString }}
	/>,
	);
	});

	it('should return the input string as is when it is not HTML', () => {
	const plainText = 'Just a plain text';
	const result = safeHtmlSpan(plainText);
	expect(result).toEqual(plainText);
	});
	});

	describe('removeHTMLTags', () => {
	it('should remove HTML tags from the string', () => {
	const input = '<p>Hello, <strong>World!</strong></p>';
	const output = removeHTMLTags(input);
	expect(output).toBe('Hello, World!');
	});

	it('should return the same string when no HTML tags are present', () => {
	const input = 'This is a plain text.';
	const output = removeHTMLTags(input);
	expect(output).toBe('This is a plain text.');
	});

	it('should remove nested HTML tags and return combined text content', () => {
	const input = '<div><h1>Title</h1><p>Content</p></div>';
	const output = removeHTMLTags(input);
	expect(output).toBe('TitleContent');
	});

	it('should handle self-closing tags and return an empty string', () => {
	const input = '<img src="image.png" alt="Image">';
	const output = removeHTMLTags(input);
	expect(output).toBe('');
	});

	it('should handle malformed HTML tags and remove only well-formed tags', () => {
	const input = '<div><h1>Unclosed tag';
	const output = removeHTMLTags(input);
	expect(output).toBe('Unclosed tag');
	});
	});

	describe('isJsonString', () => {
	it('valid JSON object', () => {
	const jsonString = '{"name": "John", "age": 30, "city": "New York"}';
	expect(isJsonString(jsonString)).toBe(true);
	});

	it('valid JSON array', () => {
	const jsonString = '[1, 2, 3, 4, 5]';
	expect(isJsonString(jsonString)).toBe(true);
	});

	it('valid JSON string', () => {
	const jsonString = '"Hello, world!"';
	expect(isJsonString(jsonString)).toBe(true);
	});

	it('invalid JSON with syntax error', () => {
	const jsonString = '{"name": "John", "age": 30, "city": "New York"';
	expect(isJsonString(jsonString)).toBe(false);
	});

	it('empty string', () => {
	const jsonString = '';
	expect(isJsonString(jsonString)).toBe(false);
	});

	it('non-JSON string', () => {
	const jsonString = '<p>Hello, <strong>World!</strong></p>';
	expect(isJsonString(jsonString)).toBe(false);
	});

	it('non-JSON formatted number', () => {
	const jsonString = '12345abc';
	expect(isJsonString(jsonString)).toBe(false);
	});
	});

	describe('getParagraphContents', () => {
	it('should return an object with keys for each paragraph tag', () => {
	const htmlString =
	'<div><p>First paragraph.</p><p>Second paragraph.</p></div>';
	const result = getParagraphContents(htmlString);
	expect(result).toEqual({
	p1: 'First paragraph.',
	p2: 'Second paragraph.',
	});
	});

	it('should return null if the string is not HTML', () => {
	const nonHtmlString = 'Just a plain text string.';
	expect(getParagraphContents(nonHtmlString)).toBeNull();
	});

	it('should return null if there are no <p> tags in the HTML string', () => {
	const htmlStringWithoutP = '<div><span>No paragraph here.</span></div>';
	expect(getParagraphContents(htmlStringWithoutP)).toBeNull();
	});

	it('should return an object with empty string for empty <p> tag', () => {
	const htmlStringWithEmptyP = '<div><p></p></div>';
	const result = getParagraphContents(htmlStringWithEmptyP);
	expect(result).toEqual({ p1: '' });
	});

	it('should handle HTML strings with nested <p> tags correctly', () => {
	const htmlStringWithNestedP =
	'<div><p>First paragraph <span>with nested</span> content.</p></div>';
	const result = getParagraphContents(htmlStringWithNestedP);
	expect(result).toEqual({
	p1: 'First paragraph with nested content.',
	});
	});
	});

	describe('extractTextFromHTML', () => {
	it('should extract text from HTML div tags', () => {
	const htmlString = '<div>Hello World</div>';
	const result = extractTextFromHTML(htmlString);
	expect(result).toBe('Hello World');
	});

	it('should extract text from nested HTML tags', () => {
	const htmlString = '<div><p>Hello <strong>World</strong></p></div>';
	const result = extractTextFromHTML(htmlString);
	expect(result).toBe('Hello World');
	});

	it('should extract text from multiple HTML elements', () => {
	const htmlString = '<h1>Title</h1><p>Content</p><span>Footer</span>';
	const result = extractTextFromHTML(htmlString);
	expect(result).toBe('TitleContentFooter');
	});

	it('should return original string when input is not HTML', () => {
	const plainText = 'Just plain text';
	const result = extractTextFromHTML(plainText);
	expect(result).toBe('Just plain text');
	});

	it('should return original value when input is not a string', () => {
	const numberValue = 12345;
	const result = extractTextFromHTML(numberValue);
	expect(result).toBe(12345);

	const nullValue = null;
	const nullResult = extractTextFromHTML(nullValue);
	expect(nullResult).toBe(null);

	const booleanValue = true;
	const booleanResult = extractTextFromHTML(booleanValue);
	expect(booleanResult).toBe(true);
	});

	it('should handle empty HTML tags', () => {
	const htmlString = '<div></div>';
	const result = extractTextFromHTML(htmlString);
	expect(result).toBe('');
	});

	it('should handle HTML with only whitespace', () => {
	const htmlString = '<div> </div>';
	const result = extractTextFromHTML(htmlString);
	expect(result).toBe(' ');
	});

	it('should extract text from HTML with attributes', () => {
	const htmlString = '<div class="container" id="main">Hello World</div>';
	const result = extractTextFromHTML(htmlString);
	expect(result).toBe('Hello World');
	});

	it('should handle self-closing tags', () => {
	const htmlString = '<img src="image.jpg" alt="Image"><br><p>Text after</p>';
	const result = extractTextFromHTML(htmlString);
	expect(result).toBe('Text after');
	});

	it('should handle complex HTML structure', () => {
	const htmlString = `
	<html>
	<head><title>Page Title</title></head>
	<body>
	<header><h1>Main Title</h1></header>
	<main>
	<p>First paragraph with <em>emphasis</em>.</p>
	<ul>
	<li>Item 1</li>
	<li>Item 2</li>
	</ul>
	</main>
	</body>
	</html>
	`;
	const result = extractTextFromHTML(htmlString);
	expect(result).toContain('Page Title');
	expect(result).toContain('Main Title');
	expect(result).toContain('First paragraph with emphasis.');
	expect(result).toContain('Item 1');
	expect(result).toContain('Item 2');
	});

	it('should not extract text from strings that look like HTML but are not', () => {
	const fakeHtmlString = '<abcdef:12345>';
	const result = extractTextFromHTML(fakeHtmlString);
	expect(result).toBe('<abcdef:12345>');

	const mathExpression = 'x < 5 and y > 10';
	const mathResult = extractTextFromHTML(mathExpression);
	expect(mathResult).toBe('x < 5 and y > 10');
	});
	});