-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconcepts.py
46 lines (36 loc) · 1.63 KB
/
concepts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
from rake_nltk import Rake
import re
def extract_keyphrases(text, num_phrases=5):
    """Return up to ``num_phrases`` distinct, cleaned keyphrases from ``text``.

    A default-configured ``Rake`` instance extracts candidate phrases from
    ``text``. Candidates are visited in the order given by
    ``Rake.get_ranked_phrases()``; each one is lower-cased, every run of
    non-word characters is collapsed to a single space, and surrounding
    whitespace is stripped. Candidates that end up empty, or that duplicate
    an already accepted phrase after cleaning, are skipped.

    Args:
        text: The text to extract keyphrases from.
        num_phrases: Maximum number of keyphrases to return (default 5).

    Returns:
        A list of at most ``num_phrases`` cleaned keyphrase strings, in the
        order they were accepted.
    """
    rake = Rake()
    rake.extract_keywords_from_text(text)

    selected = []
    seen = set()  # cleaned phrases already accepted, for duplicate checks
    for candidate in rake.get_ranked_phrases():
        normalized = re.sub(r'\W+', ' ', candidate.lower()).strip()

        # Nothing left after cleaning, or the same phrase was already kept.
        if not normalized or normalized in seen:
            continue

        seen.add(normalized)
        selected.append(normalized)

        # Stop scanning as soon as the quota is filled.
        if len(selected) >= num_phrases:
            break

    # The slice also covers non-positive ``num_phrases`` values.
    return selected[:num_phrases]
# Sample passage used to demonstrate extract_keyphrases.
text = """
Natural language processing (NLP) is a subfield of artificial intelligence concerned with the interaction
between computers and humans in natural language. TextRank is an extractive summarization technique
that finds the most informative sentences in a text. It works by building a graph where sentences are nodes
and edges denote the similarity between sentences.
"""

# Run the extractor with its default limit on the number of phrases.
keyphrases = extract_keyphrases(text)

# Report the results as a bulleted list under a heading.
print("Extracted Keyphrases:")
for phrase in keyphrases:
    # Capitalize the first letter so each bullet reads cleanly.
    print(f"- {phrase.capitalize()}")
    # Hook: per-keyphrase elaboration or description can be printed here.
    print()