[Integration][Bitbucket] Added support for file kind and file entity processing #1517


Merged: 27 commits from PORT-13527-add-support-for-file-kind-latest into main on Apr 8, 2025.

Commits
6b3b647  Added support for file kind (oiadebayo, Mar 18, 2025)
a013fbb  Fixed issue with parsed result type (oiadebayo, Mar 24, 2025)
72b5b12  Improved path matching and added test (oiadebayo, Mar 26, 2025)
0f27425  Added file entity processing (oiadebayo, Mar 26, 2025)
52dbd2c  Fixed client and test (oiadebayo, Mar 26, 2025)
ae213f2  fix lint (oiadebayo, Mar 26, 2025)
da1d3f7  Merge branch 'main' into PORT-13527-add-support-for-file-kind-latest (oiadebayo, Mar 26, 2025)
906aa3a  Merge branch 'main' into PORT-13527-add-support-for-file-kind-latest (oiadebayo, Mar 27, 2025)
6067b5b  Merge branch 'main' into PORT-13527-add-support-for-file-kind-latest (oiadebayo, Mar 27, 2025)
220edf6  Merge branch 'main' into PORT-13527-add-support-for-file-kind-latest (oiadebayo, Mar 27, 2025)
518263f  Updated file kind to use search api (oiadebayo, Mar 28, 2025)
a95d01b  Merge branch 'main' into PORT-13527-add-support-for-file-kind-latest (oiadebayo, Apr 2, 2025)
87d23f3  Merge branch 'main' into PORT-13527-add-support-for-file-kind-latest (oiadebayo, Apr 4, 2025)
6f23584  Attended to comments (oiadebayo, Apr 4, 2025)
5f3b207  Remove default max depth (oiadebayo, Apr 6, 2025)
0d18346  Merge branch 'main' into PORT-13527-add-support-for-file-kind-latest (oiadebayo, Apr 8, 2025)
485c5a8  Attended to review comment (oiadebayo, Apr 8, 2025)
143acd1  Fixed failing test (oiadebayo, Apr 8, 2025)
f8e3bc2  Update poetry.lock (oiadebayo, Apr 8, 2025)
d4947c6  Update integrations/bitbucket-cloud/bitbucket_cloud/helpers/file_kind.py (oiadebayo, Apr 8, 2025)
0dfc2d3  Attended to review comments (oiadebayo, Apr 8, 2025)
cedddaf  fixing lint (oiadebayo, Apr 8, 2025)
da62ae3  Update CHANGELOG.md (oiadebayo, Apr 8, 2025)
1a1f4ac  Update port-app-config.yml (oiadebayo, Apr 8, 2025)
2006d8c  Attended to more comments (oiadebayo, Apr 8, 2025)
1b08008  Update pyproject.toml (oiadebayo, Apr 8, 2025)
db66b1c  Merge branch 'main' into PORT-13527-add-support-for-file-kind-latest (mk-armah, Apr 8, 2025)
1 change: 1 addition & 0 deletions port-app-config.yml
@@ -25,5 +25,6 @@ resources:
           properties:
             url: ".links.html.href"
             defaultBranch: .mainbranch.name
+            readme: file://README.md
           relations:
             project: '.project.uuid | gsub("[{-}]"; "")'
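The added readme mapping is the consumer-facing half of this PR: any mapped value beginning with file:// is resolved by the new FileEntityProcessor (shown further down) rather than by plain JQ. A minimal sketch of that resolution, assuming the Ocean framework dispatches values that carry a processor's prefix to that processor's _search method:

    # Hedged sketch: FileEntityProcessor and FILE_PROPERTY_PREFIX come from this
    # PR; dispatch-by-prefix is an assumption about the Ocean framework.
    processor = FileEntityProcessor(context)  # context: Ocean handler context (assumed)
    pattern = "file://README.md"
    if pattern.startswith(FILE_PROPERTY_PREFIX):
        # repo_payload is the repository entity being mapped (hypothetical);
        # _search strips the prefix and fetches README.md from the default branch.
        readme_text = await processor._search(repo_payload, pattern)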
8 changes: 8 additions & 0 deletions integrations/bitbucket-cloud/CHANGELOG.md
@@ -7,6 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 <!-- towncrier release notes start -->
 
+## 0.1.5 (2025-04-08)
+
+
+### Features
+
+- Added support for ingesting file kind and parsing JSON and YAML files
+
+
 ## 0.1.4 (2025-04-07)
57 changes: 46 additions & 11 deletions integrations/bitbucket-cloud/bitbucket_cloud/client.py
@@ -69,23 +69,22 @@ async def _send_api_request(
         params: Optional[dict[str, Any]] = None,
         json_data: Optional[dict[str, Any]] = None,
         method: str = "GET",
+        return_full_response: bool = False,
     ) -> Any:
         """Send request to Bitbucket API with error handling."""
         response = await self.client.request(
             method=method, url=url, params=params, json=json_data
         )
         try:
             response.raise_for_status()
-            return response.json()
+            return response if return_full_response else response.json()
         except HTTPStatusError as e:
-            error_data = e.response.json()
-            error_message = error_data.get("error", {}).get("message", str(e))
             if e.response.status_code == 404:
-                logger.error(
-                    f"Requested resource not found: {url}; message: {error_message}"
+                logger.warning(
+                    f"Requested resource not found: {url}; message: {str(e)}"
                 )
                 return {}
-            logger.error(f"Bitbucket API error: {error_message}")
+            logger.error(f"Bitbucket API error: {str(e)}")
             raise e
         except HTTPError as e:
             logger.error(f"Failed to send {method} request to url {url}: {str(e)}")
@@ -166,13 +165,19 @@ async def get_repositories(
             yield repos
 
     async def get_directory_contents(
-        self, repo_slug: str, branch: str, path: str, max_depth: int = 2
+        self,
+        repo_slug: str,
+        branch: str,
+        path: str,
+        max_depth: int,
+        params: Optional[dict[str, Any]] = None,
     ) -> AsyncGenerator[list[dict[str, Any]], None]:
         """Get contents of a directory."""
-        params = {
-            "max_depth": max_depth,
-            "pagelen": PAGE_SIZE,
-        }
+        if params is None:
+            params = {
+                "max_depth": max_depth,
+                "pagelen": PAGE_SIZE,
+            }
         async for contents in self._fetch_paginated_api_with_rate_limiter(
             f"{self.base_url}/repositories/{self.workspace}/{repo_slug}/src/{branch}/{path}",
             params=params,
@@ -212,3 +217,33 @@ async def get_repository(self, repo_slug: str) -> dict[str, Any]:
         return await self._send_api_request(
             f"{self.base_url}/repositories/{self.workspace}/{repo_slug}"
         )
+
+    async def get_repository_files(self, repo: str, branch: str, path: str) -> Any:
+        """Get the content of a file."""
+        response = await self._send_api_request(
+            f"{self.base_url}/repositories/{self.workspace}/{repo}/src/{branch}/{path}",
+            method="GET",
+            return_full_response=True,
+        )
+        logger.info(f"Retrieved file content for {repo}/{branch}/{path}")
+        return response.text
+
+    async def search_files(
+        self,
+        search_query: str,
+    ) -> AsyncGenerator[list[dict[str, Any]], None]:
+        """Search for files using Bitbucket's search API."""
+        params = {
+            "pagelen": 300,
+            "search_query": search_query,
+            "fields": "+values.file.commit.repository.mainbranch.name",
+        }
+
+        async for results in self._send_paginated_api_request(
+            f"{self.base_url}/workspaces/{self.workspace}/search/code",
+            params=params,
+        ):
+            logger.info(
+                f"Fetched batch of {len(results)} matching files from workspace {self.workspace}"
+            )
+            yield results
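For orientation, a short sketch of how the two new client methods compose. This is not code from the PR: init_client() is the factory imported in the helper modules below, the repo and query values are hypothetical, and the result shape mirrors what retrieve_file_content (further down) reads.

    # Hedged usage sketch; repo/path names are hypothetical.
    client = init_client()

    async def list_port_files() -> None:
        # Find every port.yaml under a services/ path across the workspace.
        query = '"port.yaml" path:services ext:yaml'
        async for batch in client.search_files(query):
            for result in batch:
                file_info = result["file"]
                repo = file_info["commit"]["repository"]["name"]
                branch = file_info["commit"]["repository"]["mainbranch"]["name"]
                # Fetch the raw text of the matched file.
                text = await client.get_repository_files(repo, branch, file_info["path"])
                print(repo, file_info["path"], len(text))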
68 changes: 68 additions & 0 deletions (new FileEntityProcessor module)
@@ -0,0 +1,68 @@
+from typing import Any, Optional
+from loguru import logger
+from port_ocean.core.handlers import JQEntityProcessor
+from initialize_client import init_client
+
+
+FILE_PROPERTY_PREFIX = "file://"
+
+
+class FileEntityProcessor(JQEntityProcessor):
+    prefix = FILE_PROPERTY_PREFIX
+
+    async def _get_file_content(
+        self, repo_slug: str, ref: str, file_path: str
+    ) -> Optional[Any]:
+        """Helper method to fetch and process file content."""
+        try:
+            bitbucket_client = init_client()
+            return await bitbucket_client.get_repository_files(
+                repo_slug, ref, file_path
+            )
+        except Exception as e:
+            logger.error(
+                f"Failed to get file content for {file_path} in repository {repo_slug} in branch {ref}: {e}"
+            )
+            return None
+
+    async def _search(self, data: dict[str, Any], pattern: str) -> Any:
+        """
+        Search for a file in the repository and return its content.
+
+        Args:
+            data (dict[str, Any]): The data containing the repository information
+            pattern (str): The pattern to search for (e.g. "file://path/to/file.yaml")
+
+        For monorepo, the data should contain a "repo" key and a "folder" key with the repository information.
+        For non-monorepo, the data should contain the repository information directly.
+
+        Returns:
+            Any: The raw or parsed content of the file
+        """
+
+        repo_data = data.get("repo", data)
+        repo_slug = repo_data.get("name", "")
+        default_branch = repo_data.get("mainbranch", {}).get("name", "main")
+
+        if current_directory_path := data.get("folder", {}).get("path", ""):
+            file_path = f"{current_directory_path}/{pattern.replace(self.prefix, '')}"
+            ref = data.get("folder", {}).get("commit", {}).get("hash", default_branch)
+        else:
+            file_path = pattern.replace(self.prefix, "")
+            if not default_branch:
+                logger.info(
+                    f"No default branch found for repository {repo_slug} and file path {file_path}"
+                )
+                return None
+            ref = default_branch
+
+        if not repo_slug:
+            logger.info(
+                f"No repository slug found for branch {ref} and file path {file_path}"
+            )
+            return None
+
+        logger.info(
+            f"Searching for file {file_path} in Repository {repo_slug}, ref {ref}"
+        )
+        return await self._get_file_content(repo_slug, ref, file_path)
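The two payload shapes _search accepts look roughly like the following; both examples are hypothetical and trimmed to the keys the method actually reads.

    # Non-monorepo: the repository payload is passed directly.
    repo_payload = {"name": "my-repo", "mainbranch": {"name": "main"}}

    # Monorepo: a "repo" key plus a "folder" key pinning the path and commit.
    folder_payload = {
        "repo": {"name": "my-repo", "mainbranch": {"name": "main"}},
        "folder": {"path": "services/api", "commit": {"hash": "abc123"}},
    }

    # "file://port.yaml" resolves to services/api/port.yaml at commit abc123
    # for folder_payload, and to port.yaml on main for repo_payload.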
153 changes: 153 additions & 0 deletions integrations/bitbucket-cloud/bitbucket_cloud/helpers/file_kind.py
@@ -0,0 +1,153 @@
+import fnmatch
+import json
+from typing import Dict, List, Any, AsyncGenerator
+from loguru import logger
+import yaml
+from integration import BitbucketFilePattern
+from port_ocean.utils.async_iterators import stream_async_iterators_tasks
+from initialize_client import init_client
+
+
+JSON_FILE_SUFFIX = ".json"
+YAML_FILE_SUFFIX = (".yaml", ".yml")
+
+
+def build_search_terms(
+    filename: str, repos: List[str] | None, path: str, extension: str
+) -> str:
+    """
+    This function builds search terms for Bitbucket's search API.
+    The entire workspace is searched for the filename if repos is not provided.
+    If repos are provided, only the repos specified are searched.
+    The path and extension are required to tailor the search so results
+    are relevant to the file kind.
+
+    Args:
+        filename (str): The filename to search for.
+        repos (List[str] | None): The repositories to search in.
+        path (str): The path to search in.
+        extension (str): The extension to search for.
+
+    Returns:
+        str: The search terms for Bitbucket's search API.
+    """
+    search_terms = [f'"{filename}"']
+    if repos:
+        repo_filters = " ".join(f"repo:{repo}" for repo in repos)
+        search_terms.append(f"{repo_filters}")
+
+    search_terms.append(f"path:{path}")
+
+    if extension:
+        search_terms.append(f"ext:{extension}")
+
+    return " ".join(search_terms)
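A worked example of the query string this produces (repo and path values are hypothetical):

    query = build_search_terms(
        filename="port.yaml",
        repos=["repo-a", "repo-b"],
        path="services",
        extension="yaml",
    )
    # query == '"port.yaml" repo:repo-a repo:repo-b path:services ext:yaml'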


+async def process_file_patterns(
+    file_pattern: BitbucketFilePattern,
+) -> AsyncGenerator[List[Dict[str, Any]], None]:
+    """Process file patterns and retrieve matching files using Bitbucket's search API."""
+    logger.info(
+        f"Searching for files in {len(file_pattern.repos) if file_pattern.repos else 'all'} repositories with pattern: {file_pattern.path}"
+    )
+
+    if not file_pattern.repos:
+        logger.warning("No repositories provided, searching entire workspace")
+    if not file_pattern.path:
+        logger.info("Path is required, skipping file search")
+        return
+    if not file_pattern.filenames:
+        logger.info("No filenames provided, skipping file search")
+        return
+
+    for filename in file_pattern.filenames:
+        search_query = build_search_terms(
+            filename=filename,
+            repos=file_pattern.repos,
+            path=file_pattern.path,
+            extension=filename.split(".")[-1],
+        )
+        logger.debug(f"Constructed search query: {search_query}")
+        bitbucket_client = init_client()
+        async for search_results in bitbucket_client.search_files(search_query):
+            tasks = []
+            for result in search_results:
+                if len(result["path_matches"]) >= 1:
+                    file_info = result["file"]
+                    file_path = file_info["path"]
+
+                    if not validate_file_match(file_path, filename, file_pattern.path):
+                        logger.debug(
+                            f"Skipping file {file_path} as it doesn't match expected patterns"
+                        )
+                        continue
+
+                    tasks.append(retrieve_file_content(file_info))
+
+            async for file_results in stream_async_iterators_tasks(*tasks):
+                if not file_pattern.skip_parsing:
+                    file_results = parse_file(file_results)
+                yield [file_results]
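To make the selector concrete, here is a hedged sketch of driving process_file_patterns directly. The values are hypothetical, and only the four attributes the function reads (repos, path, filenames, skip_parsing) are assumed to be constructor arguments of BitbucketFilePattern.

    pattern = BitbucketFilePattern(
        repos=["repo-a"],         # None or empty searches the whole workspace
        path="services",          # required; an empty path skips the search
        filenames=["port.yaml"],  # one search query is issued per filename
        skip_parsing=False,       # False means JSON/YAML content gets parsed
    )

    async def collect() -> None:
        async for batch in process_file_patterns(pattern):
            for file_result in batch:
                print(file_result["repo"]["name"], file_result["metadata"]["path"])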


+async def retrieve_file_content(
+    file_info: Dict[str, Any],
+) -> AsyncGenerator[Dict[str, Any], None]:
+    """
+    Retrieve the content of a single file from Bitbucket.
+
+    Args:
+        file_info (Dict[str, Any]): Information about the file to retrieve
+
+    Yields:
+        Dict[str, Any]: Dictionary containing the file content and metadata
+    """
+    file_path = file_info.get("path", "")
+    repo_info = file_info["commit"]["repository"]
+    repo_slug = repo_info["name"]
+    branch = repo_info["mainbranch"]["name"]
+
+    logger.info(f"Retrieving contents for file: {file_path}")
+    bitbucket_client = init_client()
+    file_content = await bitbucket_client.get_repository_files(
+        repo_slug, branch, file_path
+    )
+
+    yield {
+        "content": file_content,
+        "repo": repo_info,
+        "branch": branch,
+        "metadata": file_info,
+    }


+def parse_file(file: Dict[str, Any]) -> Dict[str, Any]:
+    """Parse a file based on its extension."""
+    try:
+        file_path = file.get("metadata", {}).get("path", "")
+        file_content = file.get("content", "")
+        if file_path.endswith(JSON_FILE_SUFFIX):
+            loaded_file = json.loads(file_content)
+            file["content"] = loaded_file
+        elif file_path.endswith(YAML_FILE_SUFFIX):
+            loaded_file = yaml.safe_load(file_content)
+            file["content"] = loaded_file
+        return file
+    except Exception as e:
+        logger.error(f"Error parsing file: {e}")
+        return file
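For instance, a matched YAML file comes out of parse_file with its content replaced by the parsed structure (sample data is hypothetical):

    raw = {
        "metadata": {"path": "services/port.yaml"},
        "content": "name: api\nreplicas: 2\n",
    }
    parsed = parse_file(raw)
    # parsed["content"] == {"name": "api", "replicas": 2}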


+def validate_file_match(file_path: str, filename: str, expected_path: str) -> bool:
+    """Validate if the file path and filename match the expected patterns."""
+    if not file_path.endswith(filename):
+        return False
+
+    if (not expected_path or expected_path == "/") and file_path == filename:
+        return True
+
+    dir_path = file_path[: -len(filename)]
+    dir_path = dir_path.rstrip("/")
+    expected_path = expected_path.rstrip("/")
+    return fnmatch.fnmatch(dir_path, expected_path)
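Because the directory comparison goes through fnmatch, glob-style expected paths work; a few hypothetical checks:

    validate_file_match("services/api/port.yaml", "port.yaml", "services/*")  # True
    validate_file_match("services/api/port.yaml", "port.yaml", "services")    # False: the pattern must match the full directory
    validate_file_match("port.yaml", "port.yaml", "/")                        # True: repository root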
@@ -7,6 +7,7 @@ class ObjectKind(StrEnum):
     FOLDER = "folder"
     REPOSITORY = "repository"
     PULL_REQUEST = "pull-request"
+    FILE = "file"
 
 
 @dataclass
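The new FILE kind is what exposes all of the above to a resync. The PR's main module is not shown in this view, so the following is only a hedged sketch of the usual Ocean wiring, with a hypothetical inline selector standing in for the real resource config:

    from port_ocean.context.ocean import ocean
    from port_ocean.core.ocean_types import ASYNC_GENERATOR_RESYNC_TYPE

    @ocean.on_resync(ObjectKind.FILE)
    async def on_resync_files(kind: str) -> ASYNC_GENERATOR_RESYNC_TYPE:
        # Hypothetical selector; in practice it would come from the resync's
        # resource config rather than being constructed inline.
        pattern = BitbucketFilePattern(
            repos=None, path="services", filenames=["port.yaml"], skip_parsing=False
        )
        async for files in process_file_patterns(pattern):
            yield files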