Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[text analytics] add domain_filter param #13451

Merged
merged 5 commits into from
Sep 1, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
OpinionSentiment,
RecognizePiiEntitiesResult,
PiiEntity,
PiiEntityDomainType,
)

__all__ = [
Expand Down Expand Up @@ -59,6 +60,7 @@
'OpinionSentiment',
'RecognizePiiEntitiesResult',
'PiiEntity',
'PiiEntityDomainType',
]

__version__ = VERSION
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
# Licensed under the MIT License.
# ------------------------------------
import re
from enum import Enum
from ._generated.models import (
LanguageInput,
MultiLanguageInput,
Expand Down Expand Up @@ -64,6 +65,10 @@ def get(self, key, default=None):
return self.__dict__[key]
return default

class PiiEntityDomainType(str, Enum):
    """Domains of PII entities that callers can use to filter returned entities.

    Every member is also a ``str``, so a member compares equal to its raw
    string value (for example ``"PHI"``).
    """

    # Protected Health Information. See https://aka.ms/tanerpii for more information.
    PROTECTED_HEALTH_INFORMATION = "PHI"


class DetectedLanguage(DictMixin):
"""DetectedLanguage contains the predicted language found in text,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,10 @@ def recognize_pii_entities( # type: ignore
be used for scoring, e.g. "latest", "2019-10-01". If a model-version
is not specified, the API will default to the latest, non-preview version.
:keyword bool show_stats: If set to true, response will contain document level statistics.
:keyword domain_filter: Filters the response entities to only those included in the specified domain.
For example, if set to 'PHI', only entities in the Protected Health Information domain are returned.
See https://aka.ms/tanerpii for more information.
:paramtype domain_filter: str or ~azure.ai.textanalytics.PiiEntityDomainType
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need the same :versionadded: directive here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, they had it on v3.1-preview.1, just never supported it

:return: The combined list of :class:`~azure.ai.textanalytics.RecognizePiiEntitiesResult`
and :class:`~azure.ai.textanalytics.DocumentError` in the order the original documents
were passed in.
Expand All @@ -281,13 +285,15 @@ def recognize_pii_entities( # type: ignore
docs = _validate_input(documents, "language", language)
model_version = kwargs.pop("model_version", None)
show_stats = kwargs.pop("show_stats", False)
domain_filter = kwargs.pop("domain_filter", None)
if self._string_code_unit:
kwargs.update({"string_index_type": self._string_code_unit})
try:
return self._client.entities_recognition_pii(
documents=docs,
model_version=model_version,
show_stats=show_stats,
domain=domain_filter,
cls=kwargs.pop("cls", pii_entities_result),
**kwargs
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,10 @@ async def recognize_pii_entities( # type: ignore
be used for scoring, e.g. "latest", "2019-10-01". If a model-version
is not specified, the API will default to the latest, non-preview version.
:keyword bool show_stats: If set to true, response will contain document level statistics.
:keyword domain_filter: Filters the response entities to only those included in the specified domain.
For example, if set to 'PHI', only entities in the Protected Health Information domain are returned.
See https://aka.ms/tanerpii for more information.
:paramtype domain_filter: str or ~azure.ai.textanalytics.PiiEntityDomainType
:return: The combined list of :class:`~azure.ai.textanalytics.RecognizePiiEntitiesResult`
and :class:`~azure.ai.textanalytics.DocumentError` in the order the original documents
were passed in.
Expand All @@ -283,13 +287,16 @@ async def recognize_pii_entities( # type: ignore
docs = _validate_input(documents, "language", language)
model_version = kwargs.pop("model_version", None)
show_stats = kwargs.pop("show_stats", False)
domain_filter = kwargs.pop("domain_filter", None)

if self._string_code_unit:
kwargs.update({"string_index_type": self._string_code_unit})
try:
return await self._client.entities_recognition_pii(
documents=docs,
model_version=model_version,
show_stats=show_stats,
domain=domain_filter,
cls=kwargs.pop("cls", pii_entities_result),
**kwargs
)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
interactions:
- request:
body: '{"documents": [{"id": "0", "text": "I work at Microsoft and my phone number
is 333-333-3333", "language": "en"}]}'
headers:
Accept:
- application/json, text/json
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Length:
- '113'
Content-Type:
- application/json
User-Agent:
- azsdk-python-ai-textanalytics/5.0.1 Python/3.8.5 (macOS-10.13.6-x86_64-i386-64bit)
method: POST
uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.1/entities/recognition/pii?showStats=false&domain=PHI&stringIndexType=UnicodeCodePoint
response:
body:
string: '{"documents":[{"id":"0","entities":[{"text":"333-333-3333","category":"Phone
Number","offset":43,"length":12,"confidenceScore":0.8}],"warnings":[]}],"errors":[],"modelVersion":"2020-07-01"}'
headers:
apim-request-id:
- c2319b95-6fd2-46c9-80e3-06c8f2701825
content-type:
- application/json; charset=utf-8
csp-billing-usage:
- CognitiveServices.TextAnalytics.BatchScoring=1
date:
- Mon, 31 Aug 2020 20:32:54 GMT
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
transfer-encoding:
- chunked
x-content-type-options:
- nosniff
x-envoy-upstream-service-time:
- '79'
status:
code: 200
message: OK
version: 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
interactions:
- request:
body: '{"documents": [{"id": "0", "text": "I work at Microsoft and my phone number
is 333-333-3333", "language": "en"}]}'
headers:
Accept:
- application/json, text/json
Content-Length:
- '113'
Content-Type:
- application/json
User-Agent:
- azsdk-python-ai-textanalytics/5.0.1 Python/3.8.5 (macOS-10.13.6-x86_64-i386-64bit)
method: POST
uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.1/entities/recognition/pii?showStats=false&domain=PHI&stringIndexType=UnicodeCodePoint
response:
body:
string: '{"documents":[{"id":"0","entities":[{"text":"333-333-3333","category":"Phone
Number","offset":43,"length":12,"confidenceScore":0.8}],"warnings":[]}],"errors":[],"modelVersion":"2020-07-01"}'
headers:
apim-request-id: 9265752d-3262-4dbb-94d6-be26889e3db9
content-type: application/json; charset=utf-8
csp-billing-usage: CognitiveServices.TextAnalytics.BatchScoring=1
date: Mon, 31 Aug 2020 20:32:55 GMT
strict-transport-security: max-age=31536000; includeSubDomains; preload
transfer-encoding: chunked
x-content-type-options: nosniff
x-envoy-upstream-service-time: '82'
status:
code: 200
message: OK
url: https://westus2.api.cognitive.microsoft.com//text/analytics/v3.1-preview.1/entities/recognition/pii?showStats=false&domain=PHI&stringIndexType=UnicodeCodePoint
version: 1
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
TextDocumentInput,
VERSION,
TextAnalyticsApiVersion,
PiiEntityDomainType,
)

# pre-apply the client_cls positional argument so it needn't be explicitly passed below
Expand Down Expand Up @@ -573,4 +574,17 @@ def test_recognize_pii_entities_v3(self, client):
with pytest.raises(NotImplementedError) as excinfo:
client.recognize_pii_entities(["this should fail"])

assert "'recognize_pii_entities' endpoint is only available for API version v3.1-preview.1 and up" in str(excinfo.value)
assert "'recognize_pii_entities' endpoint is only available for API version v3.1-preview.1 and up" in str(excinfo.value)

@GlobalTextAnalyticsAccountPreparer()
@TextAnalyticsClientPreparer()
def test_phi_domain_filter(self, client):
    """Check that the PHI domain filter narrows the recognized PII entities."""
    # Unfiltered, this document is expected to yield two entities (Microsoft as
    # an organization, plus the phone number). With the PHI domain filter only
    # the phone number should come back.
    documents = ["I work at Microsoft and my phone number is 333-333-3333"]
    response = client.recognize_pii_entities(
        documents,
        domain_filter=PiiEntityDomainType.PROTECTED_HEALTH_INFORMATION
    )

    entities = response[0].entities
    self.assertEqual(len(entities), 1)

    phone_entity = entities[0]
    self.assertEqual(phone_entity.text, '333-333-3333')
    self.assertEqual(phone_entity.category, 'Phone Number')
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
TextDocumentInput,
VERSION,
TextAnalyticsApiVersion,
PiiEntityDomainType,
)

# pre-apply the client_cls positional argument so it needn't be explicitly passed below
Expand Down Expand Up @@ -572,3 +573,16 @@ async def test_recognize_pii_entities_v3(self, client):
await client.recognize_pii_entities(["this should fail"])

assert "'recognize_pii_entities' endpoint is only available for API version v3.1-preview.1 and up" in str(excinfo.value)

@GlobalTextAnalyticsAccountPreparer()
@TextAnalyticsClientPreparer()
async def test_phi_domain_filter(self, client):
    """Check that the PHI domain filter narrows the recognized PII entities."""
    # Unfiltered, this document is expected to yield two entities (Microsoft as
    # an organization, plus the phone number). With the PHI domain filter only
    # the phone number should come back.
    documents = ["I work at Microsoft and my phone number is 333-333-3333"]
    response = await client.recognize_pii_entities(
        documents,
        domain_filter=PiiEntityDomainType.PROTECTED_HEALTH_INFORMATION
    )

    entities = response[0].entities
    self.assertEqual(len(entities), 1)

    phone_entity = entities[0]
    self.assertEqual(phone_entity.text, '333-333-3333')
    self.assertEqual(phone_entity.category, 'Phone Number')