PK
foo
tag is never designated as an empty-element tag. Even if the markup shows it as an empty-element tag, it shouldn't be presented that way. """ soup = self.soup("
", "
foobaz
and tags, even if that would mean not prettifying the markup. """ pre_markup = " " textarea_markup = " woo\nwoo " self.assertSoupEquals(pre_markup) self.assertSoupEquals(textarea_markup) soup = self.soup(pre_markup) self.assertEqual(soup.pre.prettify(), pre_markup) soup = self.soup(textarea_markup) self.assertEqual(soup.textarea.prettify(), textarea_markup) soup = self.soup("") self.assertEqual(soup.textarea.prettify(), "") def test_nested_inline_elements(self): """Inline elements can be nested indefinitely.""" b_tag = "Inside a B tag" self.assertSoupEquals(b_tag) nested_b_tag = "A nested tag" self.assertSoupEquals(nested_b_tag) double_nested_b_tag = "A doubly nested tag" self.assertSoupEquals(nested_b_tag) def test_nested_block_level_elements(self): """Block elements can be nested.""" soup = self.soup('Foo') blockquote = soup.blockquote self.assertEqual(blockquote.p.b.string, 'Foo') self.assertEqual(blockquote.b.string, 'Foo') def test_correctly_nested_tables(self): """One table can go inside another one.""" markup = ('' '' "Here's another table:" '' 'foo' '') self.assertSoupEquals( markup, 'Here\'s another table:' 'foo' '') self.assertSoupEquals( "Foo" "Bar" "Baz") def test_deeply_nested_multivalued_attribute(self): # html5lib can set the attributes of the same tag many times # as it rearranges the tree. This has caused problems with # multivalued attributes. markup = '' soup = self.soup(markup) self.assertEqual(["css"], soup.div.div['class']) def test_multivalued_attribute_on_html(self): # html5lib uses a different API to set the attributes ot the # tag. This has caused problems with multivalued # attributes. markup = '' soup = self.soup(markup) self.assertEqual(["a", "b"], soup.html['class']) def test_angle_brackets_in_attribute_values_are_escaped(self): self.assertSoupEquals('', '') def test_entities_in_attributes_converted_to_unicode(self): expect = '' self.assertSoupEquals('', expect) self.assertSoupEquals('', expect) self.assertSoupEquals('', expect) self.assertSoupEquals('', expect) def test_entities_in_text_converted_to_unicode(self): expect = 'pi\N{LATIN SMALL LETTER N WITH TILDE}ata' self.assertSoupEquals("piñata", expect) self.assertSoupEquals("piñata", expect) self.assertSoupEquals("piñata", expect) self.assertSoupEquals("piñata", expect) def test_quot_entity_converted_to_quotation_mark(self): self.assertSoupEquals("I said "good day!"", 'I said "good day!"') def test_out_of_range_entity(self): expect = "\N{REPLACEMENT CHARACTER}" self.assertSoupEquals("", expect) self.assertSoupEquals("", expect) self.assertSoupEquals("", expect) def test_multipart_strings(self): "Mostly to prevent a recurrence of a bug in the html5lib treebuilder." soup = self.soup("\nfoo") self.assertEqual("p", soup.h2.string.next_element.name) self.assertEqual("p", soup.p.name) self.assertConnectedness(soup) def test_head_tag_between_head_and_body(self): "Prevent recurrence of a bug in the html5lib treebuilder." content = """ foo """ soup = self.soup(content) self.assertNotEqual(None, soup.html.body) self.assertConnectedness(soup) def test_multiple_copies_of_a_tag(self): "Prevent recurrence of a bug in the html5lib treebuilder." content = """ """ soup = self.soup(content) self.assertConnectedness(soup.article) def test_basic_namespaces(self): """Parsers don't need to *understand* namespaces, but at the very least they should not choke on namespaces or lose data.""" markup = b'4' soup = self.soup(markup) self.assertEqual(markup, soup.encode()) html = soup.html self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns']) self.assertEqual( 'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml']) self.assertEqual( 'http://www.w3.org/2000/svg', soup.html['xmlns:svg']) def test_multivalued_attribute_value_becomes_list(self): markup = b'' soup = self.soup(markup) self.assertEqual(['foo', 'bar'], soup.a['class']) # # Generally speaking, tests below this point are more tests of # Beautiful Soup than tests of the tree builders. But parsers are # weird, so we run these tests separately for every tree builder # to detect any differences between them. # def test_can_parse_unicode_document(self): # A seemingly innocuous document... but it's in Unicode! And # it contains characters that can't be represented in the # encoding found in the declaration! The horror! markup = 'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!' soup = self.soup(markup) self.assertEqual('Sacr\xe9 bleu!', soup.body.string) def test_soupstrainer(self): """Parsers should be able to work with SoupStrainers.""" strainer = SoupStrainer("b") soup = self.soup("A bold statement", parse_only=strainer) self.assertEqual(soup.decode(), "bold") def test_single_quote_attribute_values_become_double_quotes(self): self.assertSoupEquals("", '') def test_attribute_values_with_nested_quotes_are_left_alone(self): text = """a""" self.assertSoupEquals(text) def test_attribute_values_with_double_nested_quotes_get_quoted(self): text = """a""" soup = self.soup(text) soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"' self.assertSoupEquals( soup.foo.decode(), """a""") def test_ampersand_in_attribute_value_gets_escaped(self): self.assertSoupEquals('', '') self.assertSoupEquals( 'foo', 'foo') def test_escaped_ampersand_in_attribute_value_is_left_alone(self): self.assertSoupEquals('') def test_entities_in_strings_converted_during_parsing(self): # Both XML and HTML entities are converted to Unicode characters # during parsing. text = "<<sacré bleu!>>" expected = "<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" self.assertSoupEquals(text, expected) def test_smart_quotes_converted_on_the_way_in(self): # Microsoft smart quotes are converted to Unicode characters during # parsing. quote = b"\x91Foo\x92" soup = self.soup(quote) self.assertEqual( soup.p.string, "\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}") def test_non_breaking_spaces_converted_on_the_way_in(self): soup = self.soup(" ") self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2) def test_entities_converted_on_the_way_out(self): text = "<<sacré bleu!>>" expected = "<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>".encode("utf-8") soup = self.soup(text) self.assertEqual(soup.p.encode("utf-8"), expected) def test_real_iso_latin_document(self): # Smoke test of interrelated functionality, using an # easy-to-understand document. # Here it is in Unicode. Note that it claims to be in ISO-Latin-1. unicode_html = 'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!' # That's because we're going to encode it into ISO-Latin-1, and use # that to test. iso_latin_html = unicode_html.encode("iso-8859-1") # Parse the ISO-Latin-1 HTML. soup = self.soup(iso_latin_html) # Encode it to UTF-8. result = soup.encode("utf-8") # What do we expect the result to look like? Well, it would # look like unicode_html, except that the META tag would say # UTF-8 instead of ISO-Latin-1. expected = unicode_html.replace("ISO-Latin-1", "utf-8") # And, of course, it would be in UTF-8, not Unicode. expected = expected.encode("utf-8") # Ta-da! self.assertEqual(result, expected) def test_real_shift_jis_document(self): # Smoke test to make sure the parser can handle a document in # Shift-JIS encoding, without choking. shift_jis_html = ( b'' b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f' b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c' b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B' b'') unicode_html = shift_jis_html.decode("shift-jis") soup = self.soup(unicode_html) # Make sure the parse tree is correctly encoded to various # encodings. self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8")) self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp")) def test_real_hebrew_document(self): # A real-world test to make sure we can convert ISO-8859-9 (a # Hebrew encoding) to UTF-8. hebrew_document = b'Hebrew (ISO 8859-8) in Visual DirectionalityHebrew (ISO 8859-8) in Visual Directionality\xed\xe5\xec\xf9' soup = self.soup( hebrew_document, from_encoding="iso8859-8") # Some tree builders call it iso8859-8, others call it iso-8859-9. # That's not a difference we really care about. assert soup.original_encoding in ('iso8859-8', 'iso-8859-8') self.assertEqual( soup.encode('utf-8'), hebrew_document.decode("iso8859-8").encode("utf-8")) def test_meta_tag_reflects_current_encoding(self): # Here's the tag saying that a document is # encoded in Shift-JIS. meta_tag = ('') # Here's a document incorporating that meta tag. shift_jis_html = ( '\n%s\n' '' 'Shift-JIS markup goes here.') % meta_tag soup = self.soup(shift_jis_html) # Parse the document, and the charset is seemingly unaffected. parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'}) content = parsed_meta['content'] self.assertEqual('text/html; charset=x-sjis', content) # But that value is actually a ContentMetaAttributeValue object. self.assertTrue(isinstance(content, ContentMetaAttributeValue)) # And it will take on a value that reflects its current # encoding. self.assertEqual('text/html; charset=utf8', content.encode("utf8")) # For the rest of the story, see TestSubstitutions in # test_tree.py. def test_html5_style_meta_tag_reflects_current_encoding(self): # Here's the tag saying that a document is # encoded in Shift-JIS. meta_tag = ('') # Here's a document incorporating that meta tag. shift_jis_html = ( '\n%s\n' '' 'Shift-JIS markup goes here.') % meta_tag soup = self.soup(shift_jis_html) # Parse the document, and the charset is seemingly unaffected. parsed_meta = soup.find('meta', id="encoding") charset = parsed_meta['charset'] self.assertEqual('x-sjis', charset) # But that value is actually a CharsetMetaAttributeValue object. self.assertTrue(isinstance(charset, CharsetMetaAttributeValue)) # And it will take on a value that reflects its current # encoding. self.assertEqual('utf8', charset.encode("utf8")) def test_tag_with_no_attributes_can_have_attributes_added(self): data = self.soup("text") data.a['foo'] = 'bar' self.assertEqual('text', data.a.decode()) class XMLTreeBuilderSmokeTest(object): def test_pickle_and_unpickle_identity(self): # Pickling a tree, then unpickling it, yields a tree identical # to the original. tree = self.soup("foo") dumped = pickle.dumps(tree, 2) loaded = pickle.loads(dumped) self.assertEqual(loaded.__class__, BeautifulSoup) self.assertEqual(loaded.decode(), tree.decode()) def test_docstring_generated(self): soup = self.soup("") self.assertEqual( soup.encode(), b'\n') def test_xml_declaration(self): markup = b"""\n""" soup = self.soup(markup) self.assertEqual(markup, soup.encode("utf8")) def test_processing_instruction(self): markup = b"""\n""" soup = self.soup(markup) self.assertEqual(markup, soup.encode("utf8")) def test_real_xhtml_document(self): """A real XHTML document should come out *exactly* the same as it went in.""" markup = b""" Hello. Goodbye. """ soup = self.soup(markup) self.assertEqual( soup.encode("utf-8"), markup) def test_formatter_processes_script_tag_for_xml_documents(self): doc = """ """ soup = BeautifulSoup(doc, "lxml-xml") # lxml would have stripped this while parsing, but we can add # it later. soup.script.string = 'console.log("< < hey > > ");' encoded = soup.encode() self.assertTrue(b"< < hey > >" in encoded) def test_can_parse_unicode_document(self): markup = 'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!' soup = self.soup(markup) self.assertEqual('Sacr\xe9 bleu!', soup.root.string) def test_popping_namespaced_tag(self): markup = 'b2012-07-02T20:33:42Zcd' soup = self.soup(markup) self.assertEqual( str(soup.rss), markup) def test_docstring_includes_correct_encoding(self): soup = self.soup("") self.assertEqual( soup.encode("latin1"), b'\n') def test_large_xml_document(self): """A large XML document should come out the same as it went in.""" markup = (b'\n' + b'0' * (2**12) + b'') soup = self.soup(markup) self.assertEqual(soup.encode("utf-8"), markup) def test_tags_are_empty_element_if_and_only_if_they_are_empty(self): self.assertSoupEquals("", "") self.assertSoupEquals("foo") def test_namespaces_are_preserved(self): markup = 'This tag is in the a namespaceThis tag is in the b namespace' soup = self.soup(markup) root = soup.root self.assertEqual("http://example.com/", root['xmlns:a']) self.assertEqual("http://example.net/", root['xmlns:b']) def test_closing_namespaced_tag(self): markup = '20010504' soup = self.soup(markup) self.assertEqual(str(soup.p), markup) def test_namespaced_attributes(self): markup = '' soup = self.soup(markup) self.assertEqual(str(soup.foo), markup) def test_namespaced_attributes_xml_namespace(self): markup = 'bar' soup = self.soup(markup) self.assertEqual(str(soup.foo), markup) class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest): """Smoke test for a tree builder that supports HTML5.""" def test_real_xhtml_document(self): # Since XHTML is not HTML5, HTML5 parsers are not tested to handle # XHTML documents in any particular way. pass def test_html_tags_have_namespace(self): markup = "" soup = self.soup(markup) self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace) def test_svg_tags_have_namespace(self): markup = '' soup = self.soup(markup) namespace = "http://www.w3.org/2000/svg" self.assertEqual(namespace, soup.svg.namespace) self.assertEqual(namespace, soup.circle.namespace) def test_mathml_tags_have_namespace(self): markup = '5' soup = self.soup(markup) namespace = 'http://www.w3.org/1998/Math/MathML' self.assertEqual(namespace, soup.math.namespace) self.assertEqual(namespace, soup.msqrt.namespace) def test_xml_declaration_becomes_comment(self): markup = '' soup = self.soup(markup) self.assertTrue(isinstance(soup.contents[0], Comment)) self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?') self.assertEqual("html", soup.contents[0].next_element.name) def skipIf(condition, reason): def nothing(test, *args, **kwargs): return None def decorator(test_item): if condition: return nothing else: return test_item return decorator Simpan
A nested tag
A doubly nested tag
Foo
pi\N{LATIN SMALL LETTER N WITH TILDE}ata
piñata
I said "good day!"
<<sacré bleu!>>
<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>
\x91Foo\x92
Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!
' b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f' b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c' b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B' b'
20010504