Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions tests/test_page_inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@
import pytest
import requests

import parsel
from web_poet.page_inputs import (
HttpRequest,
HttpResponse,
HttpRequestBody,
HttpResponseBody,
HttpRequestHeaders,
HttpResponseHeaders,
BrowserHtml,
)


Expand Down Expand Up @@ -421,3 +423,14 @@ def test_html5_meta_charset():
response = HttpResponse("http://www.example.com", body=body)
assert response.encoding == 'gb18030'
assert response.text == body.decode('gb18030')


def test_browser_html():
    """BrowserHtml compares like a plain ``str`` and offers selector shortcuts."""
    markup = "<html><body><p>Hello, </p><p>world!</p></body></html>"
    paragraph_texts = ["Hello, ", "world!"]
    page = BrowserHtml(markup)

    # Equality follows the underlying string value.
    assert page == markup
    assert page != "foo"

    # The XPath and CSS shortcuts must both find the same text nodes.
    assert page.xpath("//p/text()").getall() == paragraph_texts
    assert page.css("p::text").getall() == paragraph_texts
    assert isinstance(page.selector, parsel.Selector)
1 change: 1 addition & 0 deletions web_poet/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
HttpResponseHeaders,
HttpRequestBody,
HttpResponseBody,
BrowserHtml,
)
from .overrides import PageObjectRegistry, consume_modules, OverrideRule

Expand Down
55 changes: 40 additions & 15 deletions web_poet/mixins.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,48 @@
import abc
from urllib.parse import urljoin

import parsel
from w3lib.html import get_base_url


class ResponseShortcutsMixin:
class SelectableMixin(abc.ABC):
"""
Inherit from this mixin, implement ``._selector_input`` method,
get ``.selector`` property and ``.xpath`` / ``.css`` methods.
"""
__cached_selector = None

@abc.abstractmethod
def _selector_input(self) -> str:
    """Return the text that ``.selector`` parses.

    Subclasses must override this method; the base implementation only
    raises ``NotImplementedError``.
    """
    raise NotImplementedError()  # pragma: nocover

@property
def selector(self) -> parsel.Selector:
"""Cached instance of :external:class:`parsel.selector.Selector`."""
# XXX: caching is implemented in a manual way to avoid issues with
# non-hashable classes, where memoizemethod_noargs doesn't work
if self.__cached_selector is not None:
return self.__cached_selector
# XXX: should we pass base_url=self.url, as Scrapy does?
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we can remove this comment since it's being used by BrowserHtml, which doesn't rely on a URL. Or do you foresee a need for it later on, @kmike?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If that's the right thing to do (which I'm not sure about - it seems it's not needed), we'd need to have a URL for selectors to work properly. In this case, having a class like BrowserResponse, which contains both URL and HTML (similar to what we had with AutoextractHtml), might be better.

That said, it won't be a part of BrowserHtml, so it does make sense to remove the comment, thanks @BurnzZ!

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually I think it might be better to keep the comment, as it's a part of SelectableMixin, not of BrowserHtml class.

sel = parsel.Selector(text=self._selector_input())
self.__cached_selector = sel
return sel

def xpath(self, query, **kwargs):
    """Run the XPath ``query`` on ``.selector`` and return its result.

    Any keyword arguments are forwarded unchanged to ``.selector.xpath()``.
    """
    sel = self.selector
    return sel.xpath(query, **kwargs)

def css(self, query):
    """Run the CSS ``query`` on ``.selector`` and return its result."""
    sel = self.selector
    return sel.css(query)


# TODO: when dropping Python 3.7 support,
# fix untyped ResponseShortcutsMixin.response using typing.Protocol

class ResponseShortcutsMixin(SelectableMixin):
"""Common shortcut methods for working with HTML responses.
This mixin could be used with Page Object base classes.

It requires "response" attribute to be present.
"""
Expand All @@ -21,20 +58,8 @@ def html(self):
"""Shortcut to HTML Response's content."""
return self.response.text

@property
def selector(self) -> parsel.Selector:
"""``parsel.Selector`` instance for the HTML Response."""
# TODO: when dropping Python 3.7 support,
# implement it using typing.Protocol
return self.response.selector # type: ignore

def xpath(self, query, **kwargs):
"""Run an XPath query on a response, using :class:`parsel.Selector`."""
return self.selector.xpath(query, **kwargs)

def css(self, query):
"""Run a CSS query on a response, using :class:`parsel.Selector`."""
return self.selector.css(query)
def _selector_input(self) -> str:
    """Return the response HTML (``self.html``) as the selector input."""
    return self.html

@property
def base_url(self) -> str:
Expand Down
1 change: 1 addition & 0 deletions web_poet/page_inputs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@
HttpRequestBody,
HttpResponseBody,
)
from .browser import BrowserHtml
10 changes: 10 additions & 0 deletions web_poet/page_inputs/browser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from web_poet.mixins import SelectableMixin


class BrowserHtml(SelectableMixin, str):
    """HTML returned by a web browser, i.e. a snapshot of the DOM tree
    in HTML format.

    The instance is itself the markup string.
    """

    def _selector_input(self) -> str:
        # The string value is the markup, so it is handed over unchanged.
        return self

25 changes: 6 additions & 19 deletions web_poet/page_inputs/http.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import attrs
import json
import parsel
from typing import Optional, Dict, List, Type, TypeVar, Union, Tuple, AnyStr

import attrs
from w3lib.encoding import (
html_to_unicode,
html_body_declared_encoding,
Expand All @@ -12,6 +11,7 @@

from web_poet._base import _HttpHeaders
from web_poet.utils import memoizemethod_noargs
from web_poet.mixins import SelectableMixin

T_headers = TypeVar("T_headers", bound="HttpResponseHeaders")

Expand Down Expand Up @@ -163,7 +163,7 @@ class HttpRequest:


@attrs.define(auto_attribs=False, slots=False, eq=False)
class HttpResponse:
class HttpResponse(SelectableMixin):
"""A container for the contents of a response, downloaded directly using an
HTTP client.

Expand Down Expand Up @@ -213,6 +213,9 @@ def text(self) -> str:
self._cached_text = text
return self._cached_text

def _selector_input(self) -> str:
    """Return the response text (``self.text``) as the selector input."""
    return self.text

@property
def encoding(self):
""" Encoding of the response """
Expand All @@ -223,22 +226,6 @@ def encoding(self):
or self._body_inferred_encoding()
)

# XXX: see https://github.com/python/mypy/issues/1362
@property # type: ignore
@memoizemethod_noargs
def selector(self) -> parsel.Selector:
"""Cached instance of :external:class:`parsel.selector.Selector`."""
# XXX: should we pass base_url=self.url, as Scrapy does?
return parsel.Selector(text=self.text)

def xpath(self, query, **kwargs):
"""A shortcut to ``HttpResponse.selector.xpath()``."""
return self.selector.xpath(query, **kwargs)

def css(self, query):
"""A shortcut to ``HttpResponse.selector.css()``."""
return self.selector.css(query)

@memoizemethod_noargs
def json(self):
""" Deserialize a JSON document to a Python object. """
Expand Down