Soup API Reference¶

Auto-generated API documentation for the soup module.

`soup` ¶

HTML parser with BeautifulSoup-like API — zero-dep, stdlib only, Python 3.10+.

Provides a lightweight DOM tree built on top of html.parser.HTMLParser. Supports find, find_all, select, select_one, get_text, decompose, and find_parent — the subset of BeautifulSoup used by the vast majority of real-world scraping scripts.

Supports CSS pseudo-selectors: :first-child, :last-child, :only-child, and :not(selector).

Does NOT implement: .prettify(), .stripped_strings, .descendants iterator, .next_sibling / .previous_sibling, NavigableString class, multiple parser backends.

Example::

soup = Soup("<html><body><p class='msg'>Hello</p></body></html>")
print(soup.find("p", class_="msg").text)
# Hello

`Tag` ¶

A single HTML element node.

Attributes:

Name	Type	Description
`name`	`str`	Tag name (e.g. `"div"`).
`attrs`	`dict[str, str \| list[str]]`	Dictionary of attribute name to value. The `class` attribute is stored as a list of class names; all others as `str`.
`children`	`list[Tag \| str]`	Ordered child nodes — either `Tag` or plain `str`.
`parent`	`Tag \| None`	Parent `Tag`, or `None` for the root document.

Source code in soup/soup.py

class Tag:
    """A single HTML element node.

    Attributes:
        name: Tag name (e.g. ``"div"``).
        attrs: Dictionary of attribute name to value.  The ``class`` attribute
            is stored as a **list** of class names; all others as ``str``.
        children: Ordered child nodes — either ``Tag`` or plain ``str``.
        parent: Parent ``Tag``, or ``None`` for the root document.
    """

    __slots__ = ("name", "attrs", "children", "parent")

    def __init__(
        self,
        name: str,
        attrs: dict[str, str | list[str]] | None = None,
        parent: Tag | None = None,
    ) -> None:
        """Create an element with no children.

        Args:
            name: Tag name.
            attrs: Attribute dictionary; stored by reference (not copied).
                ``None`` creates a fresh empty dict.
            parent: Value for the ``parent`` link.  The new element is *not*
                added to ``parent.children`` here.
        """
        self.name: str = name
        self.attrs: dict[str, str | list[str]] = attrs if attrs is not None else {}
        self.children: list[Tag | str] = []
        self.parent: Tag | None = parent

    # ── Attribute access ──────────────────────────────────────────────────

    def get(self, attr: str, default: Any = None) -> Any:
        """Return attribute value, or *default* if not present."""
        return self.attrs.get(attr, default)

    def __getitem__(self, attr: str) -> Any:
        """Return attribute value; raise ``KeyError`` if missing."""
        return self.attrs[attr]

    def __contains__(self, attr: str) -> bool:
        """Return ``True`` if the element has an attribute named *attr*."""
        return attr in self.attrs

    def __setitem__(self, attr: str, value: Any) -> None:
        """Set an attribute value (e.g. ``tag['id'] = 'main'``)."""
        self.attrs[attr] = value

    def __delitem__(self, attr: str) -> None:
        """Delete an attribute (e.g. ``del tag['id']``)."""
        del self.attrs[attr]

    # ── Text helpers ──────────────────────────────────────────────────────

    @property
    def text(self) -> str:
        """Concatenated text content of this element and all descendants."""
        return self.get_text()

    @property
    def string(self) -> str | None:
        """If this element has exactly one text child (possibly nested), return it.

        Returns ``None`` when the element has no children, multiple children,
        or a mix of text and tags.
        """
        # Direct single-text child
        if len(self.children) == 1:
            child = self.children[0]
            if isinstance(child, str):
                return child
            # A single Tag child: defer to it, so <p><b>x</b></p> yields "x".
            return child.string
        # No children or multiple children -> None
        return None

    def get_text(self, separator: str = "", strip: bool = False) -> str:
        """Return all text under this element concatenated.

        Args:
            separator: Inserted between text fragments.
            strip: If ``True`` each fragment is whitespace-stripped and empty
                fragments are dropped.

        Returns:
            The combined text.
        """
        parts: list[str] = []
        self._collect_text(parts)
        if strip:
            parts = [p.strip() for p in parts]
            parts = [p for p in parts if p]
        return separator.join(parts)

    def _collect_text(self, acc: list[str]) -> None:
        """Append every text fragment below this element to *acc*, depth-first."""
        for child in self.children:
            if isinstance(child, str):
                acc.append(child)
            else:
                child._collect_text(acc)

    # ── Tree modification ─────────────────────────────────────────────────

    def append(self, child: Tag | str) -> None:
        """Append *child* to this element's children.

        If *child* is a ``Tag`` already attached to a parent, it is first
        detached from its old position.

        Args:
            child: A ``Tag`` or plain text string to append.
        """
        if isinstance(child, Tag):
            if child.parent is not None:
                try:
                    child.parent.children.remove(child)
                except ValueError:
                    # Parent link was stale (child not actually listed); ignore.
                    pass
            child.parent = self
        self.children.append(child)

    def insert(self, index: int, child: Tag | str) -> None:
        """Insert *child* at *index* in this element's children.

        If *child* is a ``Tag`` with a parent it is detached first, so *index*
        is interpreted against the child list *after* that removal.

        Args:
            index: Position to insert at (same semantics as ``list.insert``).
            child: A ``Tag`` or plain text string to insert.
        """
        if isinstance(child, Tag):
            if child.parent is not None:
                try:
                    child.parent.children.remove(child)
                except ValueError:
                    # Parent link was stale (child not actually listed); ignore.
                    pass
            child.parent = self
        self.children.insert(index, child)

    def extract(self) -> Tag:
        """Remove this element from its parent but keep its content intact.

        Unlike ``decompose``, the element and its subtree remain usable
        after extraction.

        Returns:
            This element (now detached).
        """
        if self.parent is not None:
            try:
                self.parent.children.remove(self)
            except ValueError:
                pass
            self.parent = None
        return self

    def replace_with(self, new_node: Tag | str) -> Tag:
        """Replace this element with *new_node* in the parent's children.

        If *new_node* is a ``Tag`` that is already attached somewhere (including
        as a sibling of this element), it is moved: it is detached from its old
        position and ends up exactly where this element used to be.  Replacing
        an element with itself is a no-op.

        Args:
            new_node: The replacement ``Tag`` or text string.

        Returns:
            This element (now detached, unless ``new_node is self``).

        Raises:
            ValueError: If the element has no parent.
        """
        if self.parent is None:
            raise ValueError("Cannot replace a detached element")
        if new_node is self:
            # Nothing to do; detaching here would silently delete the element.
            return self
        parent = self.parent
        slot = next(
            (i for i, child in enumerate(parent.children) if child is self), None
        )
        if slot is None:
            raise ValueError("Element not found in parent's children")  # pragma: no cover
        if isinstance(new_node, Tag):
            if new_node.parent is not None:
                try:
                    # Detach *before* writing into the slot: list.remove drops the
                    # first occurrence, so removing afterwards could delete the
                    # freshly placed node when it was a later sibling.
                    new_node.parent.children.remove(new_node)
                except ValueError:
                    pass
                # Removing an earlier sibling shifts this element left by one.
                slot = next(i for i, child in enumerate(parent.children) if child is self)
            new_node.parent = parent
        parent.children[slot] = new_node
        self.parent = None
        return self

    def unwrap(self) -> None:
        """Remove this tag but keep its children (re-parent them).

        The children are spliced into the parent's children list at the
        position formerly occupied by this element.  Does nothing when the
        element has no parent.
        """
        if self.parent is None:
            return
        parent = self.parent
        idx = next(i for i, c in enumerate(parent.children) if c is self)
        # Splice children into parent at the position of this element.
        for child in self.children:
            if isinstance(child, Tag):
                child.parent = parent
        parent.children[idx : idx + 1] = self.children
        self.children = []
        self.parent = None

    def decompose(self) -> None:
        """Remove this element from its parent and discard its content."""
        if self.parent is not None:
            try:
                self.parent.children.remove(self)
            except ValueError:
                pass
            self.parent = None
        self.children.clear()

    # ── Searching ─────────────────────────────────────────────────────────

    def find(
        self,
        name: str | list[str] | None = None,
        attrs: dict[str, str | bool] | None = None,
        *,
        class_: str | None = None,
        **kwargs: str | bool,
    ) -> Tag | None:
        """Return the first descendant matching the criteria, or ``None``.

        Args:
            name: Tag name(s) to match. ``None`` matches any tag.
            attrs: Dict of attribute filters.
            class_: Shorthand for ``attrs={"class": value}``.
            **kwargs: Extra attribute filters (``href=True`` means *has* href).

        Returns:
            The first matching ``Tag``, or ``None``.
        """
        results = self.find_all(name, attrs, class_=class_, limit=1, **kwargs)
        return results[0] if results else None

    def find_all(
        self,
        name: str | list[str] | None = None,
        attrs: dict[str, str | bool] | None = None,
        *,
        class_: str | None = None,
        limit: int | None = None,
        **kwargs: str | bool,
    ) -> list[Tag]:
        """Return all descendants matching the criteria.

        Args:
            name: Tag name(s) to match.
            attrs: Dict of attribute filters.
            class_: Shorthand for ``attrs={"class": value}``.
            limit: Stop after finding this many results.
            **kwargs: Extra attribute filters.

        Returns:
            A list of matching ``Tag`` objects.
        """
        # Later sources win on key clashes: attrs < class_ < kwargs.
        merged = dict(attrs) if attrs else {}
        if class_ is not None:
            merged["class"] = class_
        merged.update(kwargs)

        # Fast path: name-only search with no attribute filters.
        if not merged:
            if isinstance(name, str):
                results: list[Tag] = []
                self._search_by_single_name(name, results, limit)
                return results
            if isinstance(name, list):
                name_set: frozenset[str] = frozenset(name)
                results = []
                self._search_by_name_set(name_set, results, limit)
                return results

        if isinstance(name, list):
            name_set = frozenset(name)
        else:
            name_set = None  # type: ignore[assignment]

        results = []
        self._search(name, name_set, merged, results, limit)
        return results

    def __call__(self, *args: Any, **kwargs: Any) -> list[Tag]:
        """Calling a tag is equivalent to ``find_all``."""
        return self.find_all(*args, **kwargs)

    def _search(
        self,
        name: str | list[str] | None,
        name_set: frozenset[str] | None,
        attr_filters: dict[str, str | bool],
        results: list[Tag],
        limit: int | None,
    ) -> None:
        """Depth-first search appending ``_matches`` hits to *results*.

        Stops as soon as *results* holds *limit* entries (when *limit* is set).
        """
        for child in self.children:
            if limit is not None and len(results) >= limit:
                return
            if isinstance(child, Tag):
                if _matches(child, name, name_set, attr_filters):
                    results.append(child)
                    if limit is not None and len(results) >= limit:
                        return
                child._search(name, name_set, attr_filters, results, limit)

    def _search_by_name_set(
        self,
        name_set: frozenset[str],
        results: list[Tag],
        limit: int | None,
    ) -> None:
        """Fast path for searching by a set of tag names with no attr filters."""
        for child in self.children:
            if limit is not None and len(results) >= limit:
                return
            if isinstance(child, Tag):
                if child.name in name_set:
                    results.append(child)
                    if limit is not None and len(results) >= limit:
                        return
                child._search_by_name_set(name_set, results, limit)

    def _search_by_single_name(
        self,
        name: str,
        results: list[Tag],
        limit: int | None,
    ) -> None:
        """Fast path for searching by a single tag name with no attr filters."""
        for child in self.children:
            if limit is not None and len(results) >= limit:
                return
            if isinstance(child, Tag):
                if child.name == name:
                    results.append(child)
                    if limit is not None and len(results) >= limit:
                        return
                child._search_by_single_name(name, results, limit)

    # ── find_parent ───────────────────────────────────────────────────────

    def find_parent(self, name: str | None = None) -> Tag | None:
        """Walk up the tree and return the first ancestor matching *name*.

        Args:
            name: Tag name to match. ``None`` returns the immediate parent.

        Returns:
            The matching ancestor ``Tag``, or ``None``.
        """
        node = self.parent
        if name is None:
            return node
        while node is not None:
            if node.name == name:
                return node
            node = node.parent
        return None

    # ── CSS selectors ─────────────────────────────────────────────────────

    def select(self, css_selector: str) -> list[Tag]:
        """Return all descendants matching a CSS selector (simple subset).

        Supported patterns: ``tag``, ``.class``, ``#id``, ``[attr]``,
        ``[attr="value"]``, descendant (``a b``), child (``a > b``),
        compound (``div.cls#id``), multiple classes (``div.a.b``).

        Args:
            css_selector: The CSS selector string.

        Returns:
            A list of matching ``Tag`` objects.
        """
        parts = _parse_selector(css_selector)
        candidates: list[Tag] = self._all_descendants()
        return [tag for tag in candidates if _selector_matches(tag, parts)]

    def select_one(self, css_selector: str) -> Tag | None:
        """Like ``select``, but return only the first match (or ``None``).

        Args:
            css_selector: The CSS selector string.

        Returns:
            The first matching ``Tag``, or ``None``.
        """
        parts = _parse_selector(css_selector)
        for tag in self._all_descendants():
            if _selector_matches(tag, parts):
                return tag
        return None

    def _all_descendants(self) -> list[Tag]:
        """Collect all descendant Tag nodes in document order."""
        result: list[Tag] = []
        self._collect_descendants(result)
        return result

    def _collect_descendants(self, acc: list[Tag]) -> None:
        """Append each descendant ``Tag`` to *acc*, parents before their children."""
        for child in self.children:
            if isinstance(child, Tag):
                acc.append(child)
                child._collect_descendants(acc)

    # ── Serialization ────────────────────────────────────────────────────

    def to_html(self) -> str:
        """Serialize this element and its descendants back to an HTML string.

        Returns:
            The HTML markup for this subtree.
        """
        parts: list[str] = []
        self._serialize(parts)
        return "".join(parts)

    def _serialize(self, acc: list[str]) -> None:
        """Recursively build HTML string pieces into *acc*."""
        # Build opening tag.  Values and text are written verbatim (no HTML
        # escaping is applied here).
        # NOTE(review): confirm whether the tree builder stores unescaped
        # values; if so a '"' inside an attribute would yield broken markup.
        attr_parts: list[str] = []
        for k, v in self.attrs.items():
            if isinstance(v, list):
                attr_parts.append(f'{k}="{" ".join(v)}"')
            elif v == "":
                # Empty value is emitted as a bare (boolean-style) attribute.
                attr_parts.append(k)
            else:
                attr_parts.append(f'{k}="{v}"')
        attrs_str = (" " + " ".join(attr_parts)) if attr_parts else ""

        if self.name.lower() in SELF_CLOSING_TAGS:
            # Self-closing elements get no children and no closing tag.
            acc.append(f"<{self.name}{attrs_str}>")
            return

        acc.append(f"<{self.name}{attrs_str}>")
        for child in self.children:
            if isinstance(child, str):
                acc.append(child)
            else:
                child._serialize(acc)
        acc.append(f"</{self.name}>")

    def __str__(self) -> str:
        """Return the HTML serialization of this element."""
        return self.to_html()

    # ── Repr ──────────────────────────────────────────────────────────────

    def __repr__(self) -> str:
        """Return the opening tag only (name plus attributes), for debugging."""
        attrs_str = ""
        if self.attrs:
            parts = []
            for k, v in self.attrs.items():
                if isinstance(v, list):
                    parts.append(f'{k}="{" ".join(v)}"')
                else:
                    parts.append(f'{k}="{v}"')
            attrs_str = " " + " ".join(parts)
        return f"<{self.name}{attrs_str}>"

`text` `property` ¶

Concatenated text content of this element and all descendants.

`string` `property` ¶

If this element has exactly one text child (possibly nested), return it.

Returns None when the element has no children, multiple children, or a mix of text and tags.

`get(attr, default=None)` ¶

Return attribute value, or default if not present.

Source code in soup/soup.py

def get(self, attr: str, default: Any = None) -> Any:
    """Return the value stored for *attr* in ``self.attrs``.

    Args:
        attr: Attribute name to look up.
        default: Value returned when *attr* is absent.

    Returns:
        The stored attribute value, or *default* if not present.
    """
    return self.attrs.get(attr, default)

`__getitem__(attr)` ¶

Return attribute value; raise KeyError if missing.

Source code in soup/soup.py

def __getitem__(self, attr: str) -> Any:
    """Return the value stored for *attr* (``tag['href']``).

    Args:
        attr: Attribute name to look up.

    Returns:
        The stored attribute value.

    Raises:
        KeyError: If the element has no attribute named *attr*.
    """
    return self.attrs[attr]

`__setitem__(attr, value)` ¶

Set an attribute value (e.g. tag['id'] = 'main').

Source code in soup/soup.py

def __setitem__(self, attr: str, value: Any) -> None:
    """Set an attribute value (e.g. ``tag['id'] = 'main'``).

    The value is stored in ``self.attrs`` exactly as given; an existing
    entry for *attr* is overwritten.

    Args:
        attr: Attribute name.
        value: New attribute value.
    """
    self.attrs[attr] = value

`__delitem__(attr)` ¶

Delete an attribute (e.g. del tag['id']).

Source code in soup/soup.py

def __delitem__(self, attr: str) -> None:
    """Delete an attribute (e.g. ``del tag['id']``).

    Args:
        attr: Name of the attribute to remove.

    Raises:
        KeyError: If the element has no attribute named *attr*.
    """
    del self.attrs[attr]

`get_text(separator='', strip=False)` ¶

Return all text under this element concatenated.

Parameters:

Name	Type	Description	Default
`separator`	`str`	Inserted between text fragments.	`''`
`strip`	`bool`	If `True` each fragment is whitespace-stripped and empty fragments are dropped.	`False`

Returns:

Type	Description
`str`	The combined text.

Source code in soup/soup.py

def get_text(self, separator: str = "", strip: bool = False) -> str:
    """Join every text fragment found beneath this element.

    Args:
        separator: String placed between consecutive fragments.
        strip: When ``True``, whitespace is trimmed from each fragment and
            fragments that end up empty are left out.

    Returns:
        The joined text.
    """
    fragments: list[str] = []
    self._collect_text(fragments)
    if not strip:
        return separator.join(fragments)
    trimmed = (fragment.strip() for fragment in fragments)
    return separator.join(piece for piece in trimmed if piece)

`append(child)` ¶

Append child to this element's children.

If child is a Tag already attached to a parent, it is first detached from its old position.

Parameters:

Name	Type	Description	Default
`child`	`Tag \| str`	A `Tag` or plain text string to append.	required

Source code in soup/soup.py

def append(self, child: Tag | str) -> None:
    """Add *child* as the last child of this element.

    A ``Tag`` that currently has a parent is removed from that parent's
    child list before being re-attached here.  Text strings are appended
    as they are.

    Args:
        child: The ``Tag`` or plain text string to add.
    """
    if not isinstance(child, Tag):
        self.children.append(child)
        return

    previous_parent = child.parent
    if previous_parent is not None:
        try:
            previous_parent.children.remove(child)
        except ValueError:
            # The old parent did not actually list the child; nothing to undo.
            pass
    child.parent = self
    self.children.append(child)

`insert(index, child)` ¶

Insert child at index in this element's children.

Parameters:

Name	Type	Description	Default
`index`	`int`	Position to insert at (same semantics as `list.insert`).	required
`child`	`Tag \| str`	A `Tag` or plain text string to insert.	required

Source code in soup/soup.py

def insert(self, index: int, child: Tag | str) -> None:
    """Place *child* at position *index* among this element's children.

    A ``Tag`` that currently has a parent is removed from that parent's
    child list first, then re-attached here.

    Args:
        index: Target position, interpreted exactly like ``list.insert``.
        child: The ``Tag`` or plain text string to insert.
    """
    if not isinstance(child, Tag):
        self.children.insert(index, child)
        return

    previous_parent = child.parent
    if previous_parent is not None:
        try:
            previous_parent.children.remove(child)
        except ValueError:
            # The old parent did not actually list the child; nothing to undo.
            pass
    child.parent = self
    self.children.insert(index, child)

`extract()` ¶

Remove this element from its parent but keep its content intact.

Unlike decompose, the element and its subtree remain usable after extraction.

Returns:

Type	Description
`Tag`	This element (now detached).

Source code in soup/soup.py

def extract(self) -> Tag:
    """Detach this element from its parent, leaving its subtree untouched.

    In contrast to ``decompose``, the element's own children are kept, so
    the detached subtree can still be used afterwards.

    Returns:
        This element, with ``parent`` set to ``None``.
    """
    owner = self.parent
    if owner is None:
        return self
    try:
        owner.children.remove(self)
    except ValueError:
        # Parent link was stale; the element was not in the child list.
        pass
    self.parent = None
    return self

`replace_with(new_node)` ¶

Replace this element with new_node in the parent's children.

Parameters:

Name	Type	Description	Default
`new_node`	`Tag \| str`	The replacement `Tag` or text string.	required

Returns:

Type	Description
`Tag`	This element (now detached).

Raises:

Type	Description
`ValueError`	If the element has no parent.

Source code in soup/soup.py

def replace_with(self, new_node: Tag | str) -> Tag:
    """Replace this element with *new_node* in the parent's children.

    If *new_node* is a ``Tag`` that is already attached somewhere (including
    as a sibling of this element), it is moved: it is detached from its old
    position and ends up exactly where this element used to be.  Replacing
    an element with itself is a no-op.

    Args:
        new_node: The replacement ``Tag`` or text string.

    Returns:
        This element (now detached, unless ``new_node is self``).

    Raises:
        ValueError: If the element has no parent.
    """
    if self.parent is None:
        raise ValueError("Cannot replace a detached element")
    if new_node is self:
        # Nothing to do; detaching here would silently delete the element.
        return self
    parent = self.parent
    slot = next(
        (i for i, child in enumerate(parent.children) if child is self), None
    )
    if slot is None:
        raise ValueError("Element not found in parent's children")  # pragma: no cover
    if isinstance(new_node, Tag):
        if new_node.parent is not None:
            try:
                # Detach *before* writing into the slot: list.remove drops the
                # first occurrence, so removing afterwards could delete the
                # freshly placed node when it was a later sibling.
                new_node.parent.children.remove(new_node)
            except ValueError:
                pass
            # Removing an earlier sibling shifts this element left by one.
            slot = next(i for i, child in enumerate(parent.children) if child is self)
        new_node.parent = parent
    parent.children[slot] = new_node
    self.parent = None
    return self

`unwrap()` ¶

Remove this tag but keep its children (re-parent them).

The children are spliced into the parent's children list at the position formerly occupied by this element.

Source code in soup/soup.py

def unwrap(self) -> None:
    """Dissolve this tag, promoting its children into the parent.

    The children take over the slot this element occupied in the parent's
    child list, in their original order, and any ``Tag`` among them is
    re-parented.  An element without a parent is left untouched.
    """
    host = self.parent
    if host is None:
        return

    position = next(
        i for i, sibling in enumerate(host.children) if sibling is self
    )
    promoted = self.children
    for node in promoted:
        if isinstance(node, Tag):
            node.parent = host
    # Replace the one-element slice holding this tag with its children.
    host.children[position : position + 1] = promoted
    self.children = []
    self.parent = None

`decompose()` ¶

Remove this element from its parent and discard its content.

Source code in soup/soup.py

def decompose(self) -> None:
    """Detach this element from its parent and empty its child list."""
    owner = self.parent
    if owner is not None:
        try:
            owner.children.remove(self)
        except ValueError:
            # Parent link was stale; the element was not in the child list.
            pass
        self.parent = None
    # The existing list object is emptied in place rather than replaced.
    self.children.clear()

`find(name=None, attrs=None, *, class_=None, **kwargs)` ¶

Return the first descendant matching the criteria, or None.

Parameters:

Name	Type	Description	Default
`name`	`str \| list[str] \| None`	Tag name(s) to match. `None` matches any tag.	`None`
`attrs`	`dict[str, str \| bool] \| None`	Dict of attribute filters.	`None`
`class_`	`str \| None`	Shorthand for `attrs={"class": value}`.	`None`
`**kwargs`	`str \| bool`	Extra attribute filters (`href=True` means has href).	`{}`

Returns:

Type	Description
`Tag \| None`	The first matching `Tag`, or `None`.

Source code in soup/soup.py

def find(
    self,
    name: str | list[str] | None = None,
    attrs: dict[str, str | bool] | None = None,
    *,
    class_: str | None = None,
    **kwargs: str | bool,
) -> Tag | None:
    """Look up the first descendant that satisfies the given filters.

    Delegates to ``find_all`` with ``limit=1``.

    Args:
        name: Tag name or list of tag names; ``None`` accepts any tag.
        attrs: Attribute filters as a dict.
        class_: Shorthand for ``attrs={"class": value}``.
        **kwargs: Further attribute filters (``href=True`` means *has* href).

    Returns:
        The first matching ``Tag``, or ``None`` when nothing matches.
    """
    matches = self.find_all(name, attrs, class_=class_, limit=1, **kwargs)
    if matches:
        return matches[0]
    return None

`find_all(name=None, attrs=None, *, class_=None, limit=None, **kwargs)` ¶

Return all descendants matching the criteria.

Parameters:

Name	Type	Description	Default
`name`	`str \| list[str] \| None`	Tag name(s) to match.	`None`
`attrs`	`dict[str, str \| bool] \| None`	Dict of attribute filters.	`None`
`class_`	`str \| None`	Shorthand for `attrs={"class": value}`.	`None`
`limit`	`int \| None`	Stop after finding this many results.	`None`
`**kwargs`	`str \| bool`	Extra attribute filters.	`{}`

Returns:

Type	Description
`list[Tag]`	A list of matching `Tag` objects.

Source code in soup/soup.py

def find_all(
    self,
    name: str | list[str] | None = None,
    attrs: dict[str, str | bool] | None = None,
    *,
    class_: str | None = None,
    limit: int | None = None,
    **kwargs: str | bool,
) -> list[Tag]:
    """Collect every descendant that satisfies the given filters.

    Args:
        name: Tag name or list of tag names to match.
        attrs: Attribute filters as a dict.
        class_: Shorthand for ``attrs={"class": value}``.
        limit: Maximum number of results to gather before stopping.
        **kwargs: Further attribute filters.

    Returns:
        The matching ``Tag`` objects as a list.
    """
    # Combine the three filter sources; later ones override earlier keys.
    filters = dict(attrs) if attrs else {}
    if class_ is not None:
        filters["class"] = class_
    filters.update(kwargs)

    names = frozenset(name) if isinstance(name, list) else None
    found: list[Tag] = []

    # The specialised walkers only apply to a pure name search (str or list)
    # with no attribute filters; everything else uses the general matcher.
    name_only = not filters and (names is not None or isinstance(name, str))
    if not name_only:
        self._search(name, names, filters, found, limit)
    elif names is not None:
        self._search_by_name_set(names, found, limit)
    else:
        self._search_by_single_name(name, found, limit)
    return found

`__call__(*args, **kwargs)` ¶

Calling a tag is equivalent to find_all.

Source code in soup/soup.py

def __call__(self, *args: Any, **kwargs: Any) -> list[Tag]:
    """Calling a tag is equivalent to ``find_all``.

    All positional and keyword arguments are forwarded unchanged, so
    ``tag("a", href=True)`` is the same as ``tag.find_all("a", href=True)``.

    Returns:
        Whatever ``find_all`` returns for the given arguments.
    """
    return self.find_all(*args, **kwargs)

`find_parent(name=None)` ¶

Walk up the tree and return the first ancestor matching name.

Parameters:

Name	Type	Description	Default
`name`	`str \| None`	Tag name to match. `None` returns the immediate parent.	`None`

Returns:

Type	Description
`Tag \| None`	The matching ancestor `Tag`, or `None`.

Source code in soup/soup.py

def find_parent(self, name: str | None = None) -> Tag | None:
    """Climb the ``parent`` chain looking for an ancestor named *name*.

    Args:
        name: Tag name to look for.  With ``None`` the immediate parent is
            returned without any name check.

    Returns:
        The nearest ancestor whose ``name`` equals *name*, or ``None`` if
        the chain ends without a match.
    """
    ancestor = self.parent
    if name is None:
        return ancestor
    # Stop on the first name match or when the chain runs out.
    while ancestor is not None and not (ancestor.name == name):
        ancestor = ancestor.parent
    return ancestor

`select(css_selector)` ¶

Return all descendants matching a CSS selector (simple subset).

Supported patterns: tag, .class, #id, [attr], [attr="value"], descendant (a b), child (a > b), compound (div.cls#id), multiple classes (div.a.b).

Parameters:

Name	Type	Description	Default
`css_selector`	`str`	The CSS selector string.	required

Returns:

Type	Description
`list[Tag]`	A list of matching `Tag` objects.

Source code in soup/soup.py

def select(self, css_selector: str) -> list[Tag]:
    """Find every descendant matched by a CSS selector (simple subset).

    Supported patterns: ``tag``, ``.class``, ``#id``, ``[attr]``,
    ``[attr="value"]``, descendant (``a b``), child (``a > b``),
    compound (``div.cls#id``), multiple classes (``div.a.b``).

    Args:
        css_selector: Selector string to evaluate.

    Returns:
        The matching ``Tag`` objects, in the order ``_all_descendants``
        yields them.
    """
    # Parse once, then test each descendant against the parsed form.
    compiled = _parse_selector(css_selector)
    return [
        node
        for node in self._all_descendants()
        if _selector_matches(node, compiled)
    ]

`select_one(css_selector)` ¶

Like select, but return only the first match (or None).

Parameters:

Name	Type	Description	Default
`css_selector`	`str`	The CSS selector string.	required

Returns:

Type	Description
`Tag \| None`	The first matching `Tag`, or `None`.

Source code in soup/soup.py

def select_one(self, css_selector: str) -> Tag | None:
    """Return the first descendant matched by a CSS selector.

    Same selector handling as ``select``; matching stops at the first hit.

    Args:
        css_selector: Selector string to evaluate.

    Returns:
        The first matching ``Tag``, or ``None`` if there is none.
    """
    compiled = _parse_selector(css_selector)
    hits = (
        node
        for node in self._all_descendants()
        if _selector_matches(node, compiled)
    )
    # The generator is lazy, so no candidate past the first hit is tested.
    return next(hits, None)

`to_html()` ¶

Serialize this element and its descendants back to an HTML string.

Returns:

Type	Description
`str`	The HTML markup for this subtree.

Source code in soup/soup.py

def to_html(self) -> str:
    """Render this element, including its descendants, as an HTML string.

    Returns:
        The markup produced by ``_serialize`` for this subtree, joined
        into one string.
    """
    pieces: list[str] = []
    self._serialize(pieces)
    markup = "".join(pieces)
    return markup

`__str__()` ¶

Return the HTML serialization of this element.

Source code in soup/soup.py

def __str__(self) -> str:
    """Return the HTML serialization of this element.

    Makes ``str(tag)`` and ``print(tag)`` produce the same markup as
    ``tag.to_html()``.
    """
    return self.to_html()

`Soup` ¶

Bases: Tag

Parse an HTML document and provide a BeautifulSoup-like API.

Parameters:

Name	Type	Description	Default
`markup`	`str`	The HTML string to parse.	required
`parser`	`str`	Ignored (present only for API compatibility with BS4). Only `"html.parser"` is supported.	`'html.parser'`
`skip_tags`	`frozenset[str] \| None`	Optional frozenset of tag names to omit during parsing. Skipped tags and all their descendants are silently discarded, which can significantly speed up parsing of pages with many `<script>` or `<style>` blocks.	`None`

Example::

soup = Soup("<p>Hello <b>world</b></p>")
print(soup.find("b").text)
# world

Source code in soup/soup.py

class Soup(Tag):
    """Root document node: parses HTML markup into a tree of ``Tag`` objects.

    The instance is itself a ``Tag`` named ``"[document]"`` whose children
    are the top-level nodes produced by the tree builder.

    Args:
        markup: HTML source text.
        parser: Accepted only for API compatibility with BS4 and otherwise
            ignored; ``"html.parser"`` is the sole supported backend.
        skip_tags: Optional frozenset of tag names to leave out while
            parsing.  A skipped tag is dropped together with everything
            inside it, which can noticeably speed up pages containing many
            ``<script>`` or ``<style>`` blocks.

    Example::

        soup = Soup("<p>Hello <b>world</b></p>")
        print(soup.find("b").text)
        # world
    """

    def __init__(
        self,
        markup: str,
        parser: str = "html.parser",
        skip_tags: frozenset[str] | None = None,
    ) -> None:
        super().__init__("[document]")
        tree = _TreeBuilder(skip_tags=skip_tags)
        tree.feed(markup)
        # Take over the builder root's child list and point each top-level
        # tag back at this document instead of the builder's root.
        top_level = tree.root.children
        self.children = top_level
        for node in top_level:
            if isinstance(node, Tag):
                node.parent = self

    def new_tag(
        self, name: str, attrs: dict[str, str | list[str]] | None = None
    ) -> Tag:
        """Build a fresh ``Tag`` that is not attached to any tree.

        Args:
            name: Tag name (e.g. ``"p"``).
            attrs: Optional attribute dictionary.

        Returns:
            The new, parentless ``Tag``.
        """
        detached = Tag(name, attrs)
        return detached

    def to_html(self) -> str:
        """Render the whole document as an HTML string.

        Only the children are emitted; the synthetic ``[document]`` node
        itself produces no markup.

        Returns:
            The HTML markup for the whole document.
        """
        chunks: list[str] = []
        for node in self.children:
            if not isinstance(node, str):
                node._serialize(chunks)
                continue
            chunks.append(node)
        return "".join(chunks)

`new_tag(name, attrs=None)` ¶

Create a new detached Tag (not yet in the tree).

Parameters:

Name	Type	Description	Default
`name`	`str`	Tag name (e.g. `"p"`).	required
`attrs`	`dict[str, str \| list[str]] \| None`	Optional attribute dictionary.	`None`

Returns:

Type	Description
`Tag`	A new `Tag` instance with no parent.

Source code in soup/soup.py

def new_tag(
    self, name: str, attrs: dict[str, str | list[str]] | None = None
) -> Tag:
    """Create a new detached ``Tag`` (not yet in the tree).

    The tag is constructed with just *name* and *attrs*; no parent is
    passed, and this document is not modified.

    Args:
        name: Tag name (e.g. ``"p"``).
        attrs: Optional attribute dictionary.

    Returns:
        A new ``Tag`` instance with no parent.
    """
    return Tag(name, attrs)

`to_html()` ¶

Serialize the entire document back to an HTML string.

Returns:

Type	Description
`str`	The HTML markup for the whole document.

Source code in soup/soup.py

def to_html(self) -> str:
    """Render the whole document as an HTML string.

    Text children are emitted as they are; every other child contributes
    its markup through ``_serialize``.

    Returns:
        The HTML markup for the whole document.
    """
    chunks: list[str] = []
    for node in self.children:
        if not isinstance(node, str):
            node._serialize(chunks)
            continue
        chunks.append(node)
    return "".join(chunks)

Soup API Reference¶

soup ¶

Tag ¶

text property ¶

string property ¶

get(attr, default=None) ¶

__getitem__(attr) ¶

__setitem__(attr, value) ¶

__delitem__(attr) ¶

get_text(separator='', strip=False) ¶

append(child) ¶

insert(index, child) ¶

extract() ¶

replace_with(new_node) ¶

unwrap() ¶

decompose() ¶

find(name=None, attrs=None, *, class_=None, **kwargs) ¶

find_all(name=None, attrs=None, *, class_=None, limit=None, **kwargs) ¶

__call__(*args, **kwargs) ¶

find_parent(name=None) ¶

select(css_selector) ¶

select_one(css_selector) ¶

to_html() ¶

__str__() ¶

Soup ¶

new_tag(name, attrs=None) ¶

to_html() ¶

`soup` ¶

`Tag` ¶

`text` `property` ¶

`string` `property` ¶

`get(attr, default=None)` ¶

`__getitem__(attr)` ¶

`__setitem__(attr, value)` ¶

`__delitem__(attr)` ¶

`get_text(separator='', strip=False)` ¶

`append(child)` ¶

`insert(index, child)` ¶

`extract()` ¶

`replace_with(new_node)` ¶

`unwrap()` ¶

`decompose()` ¶

`find(name=None, attrs=None, *, class_=None, **kwargs)` ¶

`find_all(name=None, attrs=None, *, class_=None, limit=None, **kwargs)` ¶

`__call__(*args, **kwargs)` ¶

`find_parent(name=None)` ¶

`select(css_selector)` ¶

`select_one(css_selector)` ¶

`to_html()` ¶

`__str__()` ¶

`Soup` ¶

`new_tag(name, attrs=None)` ¶

`to_html()` ¶