Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

import fix #512

Merged
merged 3 commits into from
Sep 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pkgs/community/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
"numpy", # Common dependencies for all distributions
"requests",
"pydantic",
"pymupdf",
"swarmauri-core==0.5.0.dev7",
"swarmauri==0.5.0.dev7"
],
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,32 @@
from typing import Any
from typing import Any, Literal
import pandas as pd
from sklearn.feature_selection import mutual_info_classif
from swarmauri.metrics.base.MetricBase import MetricBase
from swarmauri.metrics.base.MetricCalculateMixin import MetricCalculateMixin


class MutualInformationMetric(MetricBase, MetricCalculateMixin):
def calculate(self, data: pd.DataFrame, target_column: str) -> float: # Now returns a float
"""
A metric class to calculate mutual information between features and a target column in a given dataset.

This class computes the mutual information between each feature in a DataFrame (excluding the target column)
and the target column itself, and returns the average mutual information score.
"""

type: Literal["MutualInformationMetric"] = "MutualInformationMetric"

def calculate(self, data: pd.DataFrame, target_column: str) -> float:
"""
Calculate the average mutual information between the features and the target column.

Parameters:
- data (pd.DataFrame): A DataFrame containing both the features and the target column.

- target_column (str): The name of the target column in the DataFrame.

Returns:
- float: The average mutual information across all feature columns.
"""
# Separate features from the target column
features_data = data.drop(columns=[target_column])
target_data = data[target_column]
Expand All @@ -14,4 +35,4 @@ def calculate(self, data: pd.DataFrame, target_column: str) -> float: # Now ret
mi = mutual_info_classif(features_data, target_data)

# Return the average mutual information across all features
return float(mi.mean()) # Output as a float
return float(mi.mean()) # Output as a float
18 changes: 10 additions & 8 deletions pkgs/community/swarmauri_community/parsers/FitzPdfParser.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
import fitz # PyMuPDF
import pymupdf # PyMuPDF
from typing import List, Union, Any, Literal
from swarmauri.standard.parsers.base.ParserBase import ParserBase
from swarmauri.core.documents.IDocument import IDocument
from swarmauri.standard.documents.concrete.Document import Document
from swarmauri.parsers.base.ParserBase import ParserBase
from swarmauri_core.documents.IDocument import IDocument
from swarmauri.documents.concrete.Document import Document


class PDFtoTextParser(ParserBase):
"""
A parser to extract text from PDF files.
"""
type: Literal['FitzPdfParser'] = 'FitzPdfParser'


type: Literal["FitzPdfParser"] = "FitzPdfParser"

def parse(self, data: Union[str, Any]) -> List[IDocument]:
"""
Parses a PDF file and extracts its text content as Document instances.
Expand All @@ -26,7 +28,7 @@ def parse(self, data: Union[str, Any]) -> List[IDocument]:

try:
# Open the PDF file
doc = fitz.open(data)
doc = pymupdf.open(data)
text = ""

# Extract text from each page
Expand All @@ -37,7 +39,7 @@ def parse(self, data: Union[str, Any]) -> List[IDocument]:
# Create a document with the extracted text
document = Document(content=text, metadata={"source": data})
return [document]

except Exception as e:
print(f"An error occurred while parsing the PDF: {e}")
return []
21 changes: 14 additions & 7 deletions pkgs/community/tests/unit/metrics/MutualInformationMetric_test.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,31 @@
import pytest
from swarmauri.metrics.concrete.MutualInformationMetric import MutualInformationMetric as Metric
from swarmauri_community.metrics.MutualInformationMetric import (
MutualInformationMetric as Metric,
)


@pytest.mark.unit
def test_ubc_resource():
assert Metric(unit='points', value=10).resource == 'Metric'
assert Metric(unit="points", value=10).resource == "Metric"


@pytest.mark.unit
def test_ubc_type():
metric = Metric(unit='points', value=10)
assert metric.type == 'MutualInformationMetric'
metric = Metric(unit="points", value=10)
assert metric.type == "MutualInformationMetric"


@pytest.mark.unit
def test_serialization():
metric = Metric(unit='points', value=10)
metric = Metric(unit="points", value=10)
assert metric.id == Metric.model_validate_json(metric.model_dump_json()).id


@pytest.mark.unit
def test_metric_value():
assert Metric(unit='points', value=10)() == 10
assert Metric(unit="points", value=10)() == 10


@pytest.mark.unit
def test_metric_unit():
assert Metric(unit='points', value=10).unit == 'bad assertion value'
assert Metric(unit="points", value=10).unit == "points"
19 changes: 11 additions & 8 deletions pkgs/community/tests/unit/parsers/FitzPdfParser_test.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,28 @@

import pytest
from swarmauri.community.parsers.FitzPdfParser import PDFtoTextParser as Parser
from swarmauri_community.parsers.FitzPdfParser import PDFtoTextParser as Parser


@pytest.mark.unit
def test_ubc_resource():
parser = Parser()
assert parser.resource == 'Parser'
assert parser.resource == "Parser"


@pytest.mark.unit
def test_ubc_type():
parser = Parser()
assert parser.type == 'FitzPdfParser'
assert parser.type == "FitzPdfParser"


@pytest.mark.unit
def test_serialization():
parser = Parser()
assert parser.id == Parser.model_validate_json(parser.model_dump_json()).id


@pytest.mark.unit
def test_parse():
documents = Parser().parse(r'resources/demo.pdf')
assert documents[0].resource == 'Document'
assert documents[0].content == 'This is a demo pdf \n'
assert documents[0].metadata['source'] == r'resources/demo.pdf'
documents = Parser().parse(r"resources/demo.pdf")
assert documents[0].resource == "Document"
assert documents[0].content == "This is a demo pdf \n"
assert documents[0].metadata["source"] == r"resources/demo.pdf"
1 change: 0 additions & 1 deletion pkgs/community/tests/unit/tools/PsutilTool_test.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from unittest.mock import patch, MagicMock
import pytest
import psutil
from swarmauri_community.tools.concrete.PsutilTool import PsutilTool as Tool
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import base64
from PIL import Image
import pytest
from swarmauri_community.tools.concrete.QrCodeGeneratorTool import (
QrCodeGeneratorTool as Tool,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@

API_KEY = os.getenv("PINECONE_API_KEY")


@pytest.mark.skipif(
not os.getenv("PINECONE_API_KEY"),
reason="Skipping due to environment variable not set",
)
@pytest.mark.unit
def test_ubc_resource():
vs = PineconeVectorStore(
Expand All @@ -16,6 +21,10 @@ def test_ubc_resource():
assert vs.embedder.resource == "Embedding"


@pytest.mark.skipif(
not os.getenv("PINECONE_API_KEY"),
reason="Skipping due to environment variable not set",
)
@pytest.mark.unit
def test_ubc_type():
vs = PineconeVectorStore(
Expand All @@ -25,6 +34,11 @@ def test_ubc_type():
)
assert vs.type == "PineconeVectorStore"


@pytest.mark.skipif(
not os.getenv("PINECONE_API_KEY"),
reason="Skipping due to environment variable not set",
)
@pytest.mark.unit
def test_serialization():
vs = PineconeVectorStore(
Expand All @@ -35,6 +49,10 @@ def test_serialization():
assert vs.id == PineconeVectorStore.model_validate_json(vs.model_dump_json()).id


@pytest.mark.skipif(
not os.getenv("PINECONE_API_KEY"),
reason="Skipping due to environment variable not set",
)
@pytest.mark.unit
def test_top_k():
vs = PineconeVectorStore(
Expand All @@ -52,6 +70,3 @@ def test_top_k():

vs.add_documents(documents)
assert len(vs.retrieve(query="test", top_k=2)) == 2



Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,7 @@
)


@pytest.mark.unit
def test_ubc_resource():
assert SpatialDocEmbedding().resource == "Embedding"

@pytest.mark.xfail('Expected to fail until we fix the bug.')
@pytest.mark.xfail(reason="Expected to fail until we fix the bug.")
def test_fit_transform():
embedder = SpatialDocEmbedding()
embedder.fit_transform(["test", "test1", "test2"])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@
)


@pytest.mark.unit
def test_ubc_resource():
assert SpatialDocEmbedding().resource == "Embedding"


@pytest.mark.unit
def test_ubc_type():
assert SpatialDocEmbedding().type == "SpatialDocEmbedding"
Expand Down
Loading