Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement Document factory constructors on language client #2164

Merged
merged 5 commits into from
Aug 23, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@

language-usage
Client <language-client>
language-document

.. toctree::
:maxdepth: 0
Expand Down
6 changes: 6 additions & 0 deletions docs/language-document.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Document
~~~~~~~~

.. automodule:: gcloud.language.document
  :members:
  :show-inheritance:
8 changes: 4 additions & 4 deletions docs/language-usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -127,21 +127,21 @@ to content stored in `Google Cloud Storage`_. We can use the

.. code-block:: python

    >>> document = client.document_from_blob('my-text-bucket',
    ...                                      'sentiment-me.txt')
    >>> document.gcs_url
    'gs://my-text-bucket/sentiment-me.txt'
    >>> document.doc_type == language.Document.PLAIN_TEXT
    True

and the :meth:`~gcloud.language.client.Client.document_from_url`
method. In either case, the document type can be specified with
the ``doc_type`` argument:

.. code-block:: python

    >>> gcs_url = 'gs://my-text-bucket/sentiment-me.txt'
    >>> document = client.document_from_url(
    ...     gcs_url, doc_type=language.Document.HTML)
    >>> document.gcs_url == gcs_url
    True
Expand Down
1 change: 1 addition & 0 deletions gcloud/language/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@
"""Client library for Google Cloud Natural Language API."""

from gcloud.language.client import Client
from gcloud.language.document import Document
93 changes: 93 additions & 0 deletions gcloud/language/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

from gcloud.client import JSONClient
from gcloud.language.connection import Connection
from gcloud.language.document import Document


class Client(JSONClient):
Expand All @@ -40,3 +41,95 @@ class Client(JSONClient):
"""

_connection_class = Connection

def document_from_text(self, content, **kwargs):
    """Build a plain-text :class:`Document` attached to this client.

    :type content: str
    :param content: The plain text to store on the document.

    :type kwargs: dict
    :param kwargs: Additional keyword arguments forwarded unchanged to
                   the :class:`Document` constructor. ``doc_type`` is
                   not accepted because this factory always sets it.

    :rtype: :class:`Document`
    :returns: A plain-text document bound to this client.
    :raises: :class:`~exceptions.TypeError` if ``doc_type`` is passed as
             a keyword argument.
    """
    # The document type is fixed by this factory, so an explicit
    # ``doc_type`` is rejected up front with a dedicated message.
    doc_type_supplied = 'doc_type' in kwargs
    if doc_type_supplied:
        raise TypeError('Cannot pass doc_type')

    document = Document(self, content=content,
                        doc_type=Document.PLAIN_TEXT, **kwargs)
    return document

def document_from_html(self, content, **kwargs):
    """Build an HTML :class:`Document` attached to this client.

    :type content: str
    :param content: The HTML text to store on the document.

    :type kwargs: dict
    :param kwargs: Additional keyword arguments forwarded unchanged to
                   the :class:`Document` constructor. ``doc_type`` is
                   not accepted because this factory always sets it.

    :rtype: :class:`Document`
    :returns: An HTML document bound to this client.
    :raises: :class:`~exceptions.TypeError` if ``doc_type`` is passed as
             a keyword argument.
    """
    # The document type is fixed by this factory, so an explicit
    # ``doc_type`` is rejected up front with a dedicated message.
    doc_type_supplied = 'doc_type' in kwargs
    if doc_type_supplied:
        raise TypeError('Cannot pass doc_type')

    document = Document(self, content=content,
                        doc_type=Document.HTML, **kwargs)
    return document

def document_from_url(self, gcs_url,
                      doc_type=Document.PLAIN_TEXT, **kwargs):
    """Build a :class:`Document` that refers to a Cloud Storage object.

    The document holds only the object's URL; no text content is stored
    on it by this factory.

    :type gcs_url: str
    :param gcs_url: The URL of the Google Cloud Storage object
                    holding the content. Of the form
                    ``gs://{bucket}/{blob-name}``.

    :type doc_type: str
    :param doc_type: (Optional) The type of text in the document.
                     Defaults to plain text. Can also be specified
                     as HTML via :attr:`~.Document.HTML`.

    :type kwargs: dict
    :param kwargs: Additional keyword arguments forwarded unchanged to
                   the :class:`Document` constructor.

    :rtype: :class:`Document`
    :returns: A document bound to this client.
    """
    document = Document(self, gcs_url=gcs_url, doc_type=doc_type, **kwargs)
    return document

def document_from_blob(self, bucket_name, blob_name,
                       doc_type=Document.PLAIN_TEXT, **kwargs):
    """Build a :class:`Document` from a bucket name and a blob name.

    The two names are combined into a ``gs://{bucket}/{blob-name}`` URL
    and the work is delegated to :meth:`document_from_url`.

    :type bucket_name: str
    :param bucket_name: The name of the bucket that contains the
                        document text.

    :type blob_name: str
    :param blob_name: The name of the blob (within the bucket) that
                      contains document text.

    :type doc_type: str
    :param doc_type: (Optional) The type of text in the document.
                     Defaults to plain text. Can also be specified
                     as HTML via :attr:`~.Document.HTML`.

    :type kwargs: dict
    :param kwargs: Additional keyword arguments forwarded unchanged to
                   the :class:`Document` constructor.

    :rtype: :class:`Document`
    :returns: A document bound to this client.
    """
    # NOTE: The names are interpolated verbatim; we assume neither the
    #       bucket name nor the blob name needs to be URL-encoded.
    location = (bucket_name, blob_name)
    gcs_url = 'gs://%s/%s' % location
    return self.document_from_url(gcs_url, doc_type=doc_type, **kwargs)
103 changes: 103 additions & 0 deletions gcloud/language/document.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Definition for Google Cloud Natural Language API documents.

A document is used to hold text to be analyzed and annotated.
"""


#: Default document language, English.
DEFAULT_LANGUAGE = 'en'


class Encoding(object):
    """Namespace of the text encoding type names a document may carry."""

    #: Unspecified encoding type.
    NONE = 'NONE'

    #: UTF-8 encoding type.
    UTF8 = 'UTF8'

    #: UTF-16 encoding type.
    UTF16 = 'UTF16'

    #: UTF-32 encoding type.
    UTF32 = 'UTF32'


class Document(object):
    """A unit of text to send to the Google Cloud Natural Language API.

    The text is either plain text or HTML. It is supplied in exactly one
    of two ways: inline through ``content``, or by reference through
    ``gcs_url``, which names a Google Cloud Storage object.

    :type client: :class:`~gcloud.language.client.Client`
    :param client: A client which holds credentials and project
                   configuration.

    :type content: str
    :param content: (Optional) The document text content (either plain
                    text or HTML).

    :type gcs_url: str
    :param gcs_url: (Optional) The URL of the Google Cloud Storage object
                    holding the content. Of the form
                    ``gs://{bucket}/{blob-name}``.

    :type doc_type: str
    :param doc_type: (Optional) The type of text in the document.
                     Defaults to plain text. Can be one of
                     :attr:`~.Document.PLAIN_TEXT` or
                     :attr:`~.Document.HTML`.

    :type language: str
    :param language: (Optional) The language of the document text.
                     Defaults to :data:`DEFAULT_LANGUAGE`.

    :type encoding: str
    :param encoding: (Optional) The encoding of the document text.
                     Defaults to UTF-8. Can be one of
                     :attr:`~.Encoding.UTF8`, :attr:`~.Encoding.UTF16`
                     or :attr:`~.Encoding.UTF32`.

    :raises: :class:`~exceptions.ValueError` if both ``content`` and
             ``gcs_url`` are specified, or if neither is specified.
    """

    #: Unspecified document type.
    TYPE_UNSPECIFIED = 'TYPE_UNSPECIFIED'

    #: Plain text document type.
    PLAIN_TEXT = 'PLAIN_TEXT'

    #: HTML document type.
    HTML = 'HTML'

    def __init__(self, client, content=None, gcs_url=None, doc_type=PLAIN_TEXT,
                 language=DEFAULT_LANGUAGE, encoding=Encoding.UTF8):
        # Exactly one text source must be given. ``None`` means "not
        # given", so an empty string still counts as supplied.
        has_content = content is not None
        has_gcs_url = gcs_url is not None
        if has_content and has_gcs_url:
            raise ValueError('A Document cannot contain both local text and '
                             'a link to text in a Google Cloud Storage object')
        if not (has_content or has_gcs_url):
            raise ValueError('A Document must contain either local text or a '
                             'link to text in a Google Cloud Storage object')

        self.client = client
        self.content = content
        self.gcs_url = gcs_url
        self.doc_type = doc_type
        self.language = language
        self.encoding = encoding
106 changes: 105 additions & 1 deletion gcloud/language/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,118 @@ def _makeOne(self, *args, **kw):

def test_ctor(self):
    from gcloud.language.connection import Connection

    project = 'PROJECT'
    creds = _Credentials()
    http = object()
    client = self._makeOne(project=project, credentials=creds, http=http)

    # One type check is enough; the identity checks use ``assertIs`` so a
    # failure reports both objects instead of "False is not true".
    self.assertIsInstance(client.connection, Connection)
    self.assertIs(client.connection.credentials, creds)
    self.assertIs(client.connection.http, http)

def test_document_from_text_factory(self):
    from gcloud.language.document import Document

    client = self._makeOne(project='PROJECT', credentials=_Credentials(),
                           http=object())
    text = 'abc'
    lang = 'es'

    doc = client.document_from_text(text, language=lang)

    self.assertIsInstance(doc, Document)
    self.assertIs(doc.client, client)
    self.assertEqual(doc.content, text)
    # The factory, not the caller, decides the document type.
    self.assertEqual(doc.doc_type, Document.PLAIN_TEXT)
    # Extra keyword arguments must reach the Document constructor.
    self.assertEqual(doc.language, lang)

def test_document_from_text_factory_failure(self):
    client = self._makeOne(project='PROJECT', credentials=_Credentials(),
                           http=object())

    # ``doc_type`` is fixed by the factory, so passing it must be rejected.
    with self.assertRaises(TypeError):
        client.document_from_text('abc', doc_type='foo')

def test_document_from_html_factory(self):
    from gcloud.language.document import Document

    client = self._makeOne(project='PROJECT', credentials=_Credentials(),
                           http=object())
    markup = '<html>abc</html>'
    lang = 'ja'

    doc = client.document_from_html(markup, language=lang)

    self.assertIsInstance(doc, Document)
    self.assertIs(doc.client, client)
    self.assertEqual(doc.content, markup)
    # The factory, not the caller, decides the document type.
    self.assertEqual(doc.doc_type, Document.HTML)
    # Extra keyword arguments must reach the Document constructor.
    self.assertEqual(doc.language, lang)

def test_document_from_html_factory_failure(self):
    client = self._makeOne(project='PROJECT', credentials=_Credentials(),
                           http=object())

    # ``doc_type`` is fixed by the factory, so passing it must be rejected.
    with self.assertRaises(TypeError):
        client.document_from_html('abc', doc_type='foo')

def test_document_from_url_factory(self):
    from gcloud.language.document import Document

    client = self._makeOne(project='PROJECT', credentials=_Credentials(),
                           http=object())
    url = 'gs://my-text-bucket/sentiment-me.txt'

    doc = client.document_from_url(url)

    self.assertIsInstance(doc, Document)
    self.assertIs(doc.client, client)
    # A URL-backed document carries no inline content.
    self.assertIsNone(doc.content)
    self.assertEqual(doc.gcs_url, url)
    # With no explicit doc_type the factory falls back to plain text.
    self.assertEqual(doc.doc_type, Document.PLAIN_TEXT)

def test_document_from_url_factory_explicit(self):
    from gcloud.language.document import Document
    from gcloud.language.document import Encoding

    client = self._makeOne(project='PROJECT', credentials=_Credentials(),
                           http=object())
    url = 'gs://my-text-bucket/sentiment-me.txt'
    utf32 = Encoding.UTF32

    doc = client.document_from_url(url, doc_type=Document.HTML,
                                   encoding=utf32)

    self.assertIsInstance(doc, Document)
    self.assertIs(doc.client, client)
    # A URL-backed document carries no inline content.
    self.assertIsNone(doc.content)
    self.assertEqual(doc.gcs_url, url)
    # Both the explicit doc_type and the forwarded keyword must stick.
    self.assertEqual(doc.doc_type, Document.HTML)
    self.assertEqual(doc.encoding, utf32)

def test_document_from_blob_factory(self):
    from gcloud.language.document import Document

    client = self._makeOne(project='PROJECT', credentials=_Credentials(),
                           http=object())
    bucket = 'my-text-bucket'
    blob = 'sentiment-me.txt'
    # The factory is expected to join the two names into a gs:// URL.
    expected_url = 'gs://%s/%s' % (bucket, blob)

    doc = client.document_from_blob(bucket, blob)

    self.assertIsInstance(doc, Document)
    self.assertIs(doc.client, client)
    self.assertIsNone(doc.content)
    self.assertEqual(doc.gcs_url, expected_url)
    self.assertEqual(doc.doc_type, Document.PLAIN_TEXT)


class _Credentials(object):

Expand Down
Loading