mirror of
https://github.com/xtekky/gpt4free.git
synced 2025-10-21 23:39:30 +08:00
refactor: improve media rendering and response formatting with precise changes
- g4f/providers/response.py: `format_images_markdown` now returns the rendered result directly instead of wrapping it in the generated-images start/end flags.
- g4f/gui/server/api.py: `_create_response_stream` accepts a new `tempfiles` parameter (default: empty list). Once the response has completed, it iterates over `tempfiles` and tries to remove each file; failures are caught in a try/except block and reported with `logger.exception`.
- g4f/tools/files.py: fixed temporary-file creation in `get_tempfile`, which now passes `suffix` through directly instead of splitting it first.
- g4f/tools/media.py: `render_part` now handles the 'text' key: it checks `part.get("text")` and, when a value is present, returns a dictionary with 'type': 'text' and 'text': value.
This commit is contained in:
@@ -6,6 +6,7 @@ import asyncio
|
||||
|
||||
from ..typing import AsyncResult, Messages
|
||||
from ..providers.response import ImageResponse
|
||||
from ..image import use_aspect_ratio
|
||||
from .base_provider import AsyncGeneratorProvider, ProviderModelMixin
|
||||
|
||||
|
||||
@@ -32,10 +33,18 @@ class ImageLabs(AsyncGeneratorProvider, ProviderModelMixin):
|
||||
# Image
|
||||
prompt: str = None,
|
||||
negative_prompt: str = "",
|
||||
width: int = 1152,
|
||||
height: int = 896,
|
||||
aspect_ratio: str = "1:1",
|
||||
width: int = None,
|
||||
height: int = None,
|
||||
extra_body: dict = {},
|
||||
**kwargs
|
||||
) -> AsyncResult:
|
||||
extra_body = use_aspect_ratio({
|
||||
"width": width,
|
||||
"height": height,
|
||||
**extra_body
|
||||
}, aspect_ratio)
|
||||
|
||||
headers = {
|
||||
'accept': '*/*',
|
||||
'accept-language': 'en-US,en;q=0.9',
|
||||
@@ -56,13 +65,12 @@ class ImageLabs(AsyncGeneratorProvider, ProviderModelMixin):
|
||||
"seed": str(int(time.time())),
|
||||
"subseed": str(int(time.time() * 1000)),
|
||||
"attention": 0,
|
||||
"width": width,
|
||||
"height": height,
|
||||
"tiling": False,
|
||||
"negative_prompt": negative_prompt,
|
||||
"reference_image": "",
|
||||
"reference_image_type": None,
|
||||
"reference_strength": 30
|
||||
"reference_strength": 30,
|
||||
**extra_body
|
||||
}
|
||||
|
||||
async with session.post(f'{cls.url}/txt2img', json=payload, proxy=proxy) as generate_response:
|
||||
|
@@ -359,18 +359,19 @@ class PollinationsAI(AsyncGeneratorProvider, ProviderModelMixin):
|
||||
return f"{url}&seed={seed}" if seed else url
|
||||
async with ClientSession(headers=DEFAULT_HEADERS, connector=get_connector(proxy=proxy)) as session:
|
||||
responses = set()
|
||||
responses.add(Reasoning(status=f"Generating {n} {'image' if n == 1 else 'images'}"))
|
||||
finished = 0
|
||||
start = time.time()
|
||||
async def get_image(responses: set, i: int, seed: Optional[int] = None):
|
||||
nonlocal finished
|
||||
start = time.time()
|
||||
async with session.get(get_image_url(i, seed), allow_redirects=False, headers={"referer": referrer}) as response:
|
||||
try:
|
||||
await raise_for_status(response)
|
||||
except Exception as e:
|
||||
debug.error(f"Error fetching image: {e}")
|
||||
responses.add(Reasoning(status=f"Image #{i+1} generated in {time.time() - start:.2f}s"))
|
||||
responses.add(ImageResponse(str(response.url), prompt))
|
||||
finished += 1
|
||||
responses.add(Reasoning(status=f"Image {finished}/{n} generated in {time.time() - start:.2f}s"))
|
||||
tasks = []
|
||||
for i in range(int(n)):
|
||||
tasks.append(asyncio.create_task(get_image(responses, i, seed)))
|
||||
@@ -426,6 +427,8 @@ class PollinationsAI(AsyncGeneratorProvider, ProviderModelMixin):
|
||||
**extra_body
|
||||
)
|
||||
async with session.post(url, json=data, headers={"referer": referrer}) as response:
|
||||
if response.status == 400:
|
||||
debug.error(f"Error: 400 - Bad Request: {data}")
|
||||
await raise_for_status(response)
|
||||
if response.headers["content-type"].startswith("text/plain"):
|
||||
yield await response.text()
|
||||
@@ -492,6 +495,6 @@ class PollinationsAI(AsyncGeneratorProvider, ProviderModelMixin):
|
||||
if finish_reason:
|
||||
yield FinishReason(finish_reason)
|
||||
else:
|
||||
async for chunk in save_response_media(response, format_image_prompt(messages), [model, extra_parameters.get("audio", {}).get("voice")]):
|
||||
async for chunk in save_response_media(response, format_image_prompt(messages), [model, extra_body.get("audio", {}).get("voice")]):
|
||||
yield chunk
|
||||
return
|
||||
|
@@ -5,7 +5,7 @@ import asyncio
|
||||
from typing import Any
|
||||
|
||||
try:
|
||||
from markitdown import MarkItDown as MaItDo, StreamInfo
|
||||
from ...integration.markitdown import MarkItDown as MaItDo, StreamInfo
|
||||
has_markitdown = True
|
||||
except ImportError:
|
||||
has_markitdown = False
|
||||
|
@@ -146,7 +146,7 @@ class Api:
|
||||
**kwargs
|
||||
}
|
||||
|
||||
def _create_response_stream(self, kwargs: dict, provider: str, download_media: bool = True) -> Iterator:
|
||||
def _create_response_stream(self, kwargs: dict, provider: str, download_media: bool = True, tempfiles: list[str] = []) -> Iterator:
|
||||
def decorated_log(text: str, file = None):
|
||||
debug.logs.append(text)
|
||||
if debug.logging:
|
||||
@@ -163,7 +163,7 @@ class Api:
|
||||
has_images="media" in kwargs,
|
||||
)
|
||||
except Exception as e:
|
||||
debug.error(e)
|
||||
logger.exception(e)
|
||||
yield self._format_json('error', type(e).__name__, message=get_error_message(e))
|
||||
return
|
||||
if not isinstance(provider_handler, BaseRetryProvider):
|
||||
@@ -198,7 +198,7 @@ class Api:
|
||||
tags = [model, kwargs.get("aspect_ratio"), kwargs.get("resolution"), kwargs.get("width"), kwargs.get("height")]
|
||||
media = asyncio.run(copy_media(chunk.get_list(), chunk.get("cookies"), chunk.get("headers"), proxy=proxy, alt=chunk.alt, tags=tags))
|
||||
media = ImageResponse(media, chunk.alt) if isinstance(chunk, ImageResponse) else VideoResponse(media, chunk.alt)
|
||||
yield self._format_json("content", str(media), urls=chunk.urls, alt=chunk.alt)
|
||||
yield self._format_json("content", str(media), urls=media.urls, alt=media.alt)
|
||||
elif isinstance(chunk, SynthesizeData):
|
||||
yield self._format_json("synthesize", chunk.get_dict())
|
||||
elif isinstance(chunk, TitleGeneration):
|
||||
@@ -232,6 +232,11 @@ class Api:
|
||||
yield self._format_json('error', type(e).__name__, message=get_error_message(e))
|
||||
finally:
|
||||
yield from self._yield_logs()
|
||||
for tempfile in tempfiles:
|
||||
try:
|
||||
os.remove(tempfile)
|
||||
except Exception as e:
|
||||
logger.exception(e)
|
||||
|
||||
def _yield_logs(self):
|
||||
if debug.logs:
|
||||
@@ -252,8 +257,6 @@ class Api:
|
||||
}
|
||||
|
||||
def handle_provider(self, provider_handler, model):
|
||||
if isinstance(provider_handler, BaseRetryProvider) and provider_handler.last_provider is not None:
|
||||
provider_handler = provider_handler.last_provider
|
||||
if model:
|
||||
return self._format_json("provider", {**provider_handler.get_dict(), "model": model})
|
||||
return self._format_json("provider", provider_handler.get_dict())
|
||||
|
@@ -8,8 +8,7 @@ import asyncio
|
||||
import shutil
|
||||
import random
|
||||
import datetime
|
||||
import tempfile
|
||||
from flask import Flask, Response, redirect, request, jsonify, render_template, send_from_directory
|
||||
from flask import Flask, Response, redirect, request, jsonify, send_from_directory
|
||||
from werkzeug.exceptions import NotFound
|
||||
from typing import Generator
|
||||
from pathlib import Path
|
||||
@@ -17,19 +16,20 @@ from urllib.parse import quote_plus
|
||||
from hashlib import sha256
|
||||
|
||||
try:
|
||||
from markitdown import MarkItDown
|
||||
from ...integration.markitdown import MarkItDown, StreamInfo
|
||||
has_markitdown = True
|
||||
except ImportError:
|
||||
except ImportError as e:
|
||||
print(e)
|
||||
has_markitdown = False
|
||||
|
||||
from ...client.service import convert_to_provider
|
||||
from ...providers.asyncio import to_sync_generator
|
||||
from ...providers.response import FinishReason
|
||||
from ...client.helper import filter_markdown
|
||||
from ...tools.files import supports_filename, get_streaming, get_bucket_dir, get_buckets
|
||||
from ...tools.files import supports_filename, get_streaming, get_bucket_dir, get_tempfile
|
||||
from ...tools.run_tools import iter_run_tools
|
||||
from ...errors import ProviderNotFoundError
|
||||
from ...image import is_allowed_extension
|
||||
from ...image import is_allowed_extension, MEDIA_TYPE_MAP
|
||||
from ...cookies import get_cookies_dir
|
||||
from ...image.copy_images import secure_filename, get_source_url, get_media_dir
|
||||
from ... import ChatCompletion
|
||||
@@ -79,9 +79,7 @@ class Backend_Api(Api):
|
||||
@app.route('/backend-api/v2/providers', methods=['GET'])
|
||||
def jsonify_providers(**kwargs):
|
||||
response = self.get_providers(**kwargs)
|
||||
if isinstance(response, list):
|
||||
return jsonify(response)
|
||||
return response
|
||||
|
||||
def get_demo_models():
|
||||
return [{
|
||||
@@ -91,7 +89,7 @@ class Backend_Api(Api):
|
||||
"audio": isinstance(model, models.AudioModel),
|
||||
"video": isinstance(model, models.VideoModel),
|
||||
"providers": [
|
||||
getattr(provider, "parent", provider.__name__)
|
||||
provider.get_parent()
|
||||
for provider in providers
|
||||
],
|
||||
"demo": True
|
||||
@@ -109,13 +107,14 @@ class Backend_Api(Api):
|
||||
json_data = json.loads(request.form['json'])
|
||||
else:
|
||||
json_data = request.json
|
||||
tempfiles = []
|
||||
if "files" in request.files:
|
||||
media = []
|
||||
for file in request.files.getlist('files'):
|
||||
if file.filename != '' and is_allowed_extension(file.filename):
|
||||
newfile = tempfile.TemporaryFile()
|
||||
shutil.copyfileobj(file.stream, newfile)
|
||||
media.append((newfile, file.filename))
|
||||
newfile = get_tempfile(file)
|
||||
tempfiles.append(newfile)
|
||||
media.append((Path(newfile), file.filename))
|
||||
json_data['media'] = media
|
||||
|
||||
if app.demo and not json_data.get("provider"):
|
||||
@@ -130,6 +129,7 @@ class Backend_Api(Api):
|
||||
kwargs,
|
||||
json_data.get("provider"),
|
||||
json_data.get("download_media", True),
|
||||
tempfiles
|
||||
),
|
||||
mimetype='text/event-stream'
|
||||
)
|
||||
@@ -306,41 +306,46 @@ class Backend_Api(Api):
|
||||
filenames = []
|
||||
media = []
|
||||
for file in request.files.getlist('files'):
|
||||
# Copy the file to a temporary location
|
||||
filename = secure_filename(file.filename)
|
||||
copyfile = tempfile.NamedTemporaryFile(suffix=filename, delete=False)
|
||||
shutil.copyfileobj(file.stream, copyfile)
|
||||
copyfile.close()
|
||||
file.stream.close()
|
||||
|
||||
mimetype = file.mimetype.split(";")[0]
|
||||
if (not filename or filename == "blob") and mimetype in MEDIA_TYPE_MAP:
|
||||
filename = f"file.{MEDIA_TYPE_MAP[mimetype]}"
|
||||
suffix = os.path.splitext(filename)[1].lower()
|
||||
copyfile = get_tempfile(file, suffix)
|
||||
result = None
|
||||
if has_markitdown:
|
||||
try:
|
||||
language = request.headers.get("x-recognition-language")
|
||||
md = MarkItDown()
|
||||
result = md.convert(copyfile.name).text_content
|
||||
result = md.convert(copyfile, stream_info=StreamInfo(
|
||||
extension=suffix,
|
||||
mimetype=file.mimetype,
|
||||
), language=language).text_content
|
||||
except Exception as e:
|
||||
logger.exception(e)
|
||||
is_media = is_allowed_extension(filename)
|
||||
is_supported = supports_filename(filename)
|
||||
if not is_media and not is_supported:
|
||||
os.remove(copyfile)
|
||||
continue
|
||||
if not is_media and result:
|
||||
with open(os.path.join(bucket_dir, f"{filename}.md"), 'w') as f:
|
||||
f.write(f"{result}\n")
|
||||
filenames.append(f"{filename}.md")
|
||||
except Exception as e:
|
||||
logger.exception(e)
|
||||
if not result:
|
||||
if is_allowed_extension(filename):
|
||||
if is_media:
|
||||
os.makedirs(media_dir, exist_ok=True)
|
||||
newfile = os.path.join(media_dir, filename)
|
||||
media.append(filename)
|
||||
elif supports_filename(filename):
|
||||
media.append({"name": filename, "text": result})
|
||||
elif not result and supports_filename(filename):
|
||||
newfile = os.path.join(bucket_dir, filename)
|
||||
filenames.append(filename)
|
||||
else:
|
||||
os.remove(copyfile.name)
|
||||
continue
|
||||
try:
|
||||
os.rename(copyfile.name, newfile)
|
||||
os.rename(copyfile, newfile)
|
||||
except OSError:
|
||||
shutil.copyfile(copyfile.name, newfile)
|
||||
os.remove(copyfile.name)
|
||||
shutil.copyfile(copyfile, newfile)
|
||||
os.remove(copyfile)
|
||||
with open(os.path.join(bucket_dir, "files.txt"), 'w') as f:
|
||||
[f.write(f"{filename}\n") for filename in filenames]
|
||||
f.write("".join([f"{filename}\n" for filename in filenames]))
|
||||
return {"bucket_id": bucket_id, "files": filenames, "media": media}
|
||||
|
||||
@app.route('/files/<bucket_id>/media/<filename>', methods=['GET'])
|
||||
|
@@ -38,6 +38,7 @@ EXTENSIONS_MAP: dict[str, str] = {
|
||||
}
|
||||
|
||||
MEDIA_TYPE_MAP: dict[str, str] = {value: key for key, value in EXTENSIONS_MAP.items()}
|
||||
MEDIA_TYPE_MAP["audio/webm"] = "webm"
|
||||
|
||||
def to_image(image: ImageType, is_svg: bool = False) -> Image:
|
||||
"""
|
||||
@@ -111,7 +112,7 @@ def is_data_an_audio(data_uri: str = None, filename: str = None) -> str:
|
||||
extension = get_extension(filename)
|
||||
if extension is not None:
|
||||
media_type = EXTENSIONS_MAP[extension]
|
||||
if media_type.startswith("audio/"):
|
||||
if media_type.startswith("audio/") or media_type == "video/webm":
|
||||
return media_type
|
||||
if isinstance(data_uri, str):
|
||||
audio_format = re.match(r'^data:(audio/\w+);base64,', data_uri)
|
||||
|
120
g4f/integration/markitdown/__init__.py
Normal file
120
g4f/integration/markitdown/__init__.py
Normal file
@@ -0,0 +1,120 @@
|
||||
import re
|
||||
import sys
|
||||
from typing import List, Union, BinaryIO
|
||||
from markitdown import MarkItDown as BaseMarkItDown
|
||||
from markitdown._stream_info import StreamInfo
|
||||
from markitdown._base_converter import DocumentConverterResult
|
||||
|
||||
from markitdown._exceptions import (
|
||||
FileConversionException,
|
||||
UnsupportedFormatException,
|
||||
FailedConversionAttempt,
|
||||
)
|
||||
|
||||
from ._audio_converter import AudioConverter
|
||||
from ._image_converter import ImageConverter
|
||||
|
||||
class MarkItDown(BaseMarkItDown):
    """Document-to-Markdown reader that also registers the local converters.

    On top of whatever the base class sets up, an ``AudioConverter`` and an
    ``ImageConverter`` are registered, and ``_convert`` forwards caller keyword
    arguments to the converters and tolerates results whose ``text_content``
    is not a plain string.
    """

    def __init__(
        self,
        **kwargs,
    ):
        super().__init__(**kwargs)
        # Registration order is kept: audio first, then image.
        for extra_converter in (AudioConverter(), ImageConverter()):
            self.register_converter(extra_converter)

    def _convert(
        self, *, file_stream: BinaryIO, stream_info_guesses: List[StreamInfo], **kwargs
    ) -> DocumentConverterResult:
        """Run the registered converters against each stream-info guess.

        Args:
            file_stream: Stream to convert; its position is restored after
                every conversion attempt.
            stream_info_guesses: Candidate descriptions of the stream. An empty
                ``StreamInfo()`` is appended as a last-resort guess.
            **kwargs: Extra options handed to ``accepts``/``convert``.

        Returns:
            The first non-``None`` result produced by a converter. String
            content is normalized (trailing whitespace stripped per line,
            runs of blank lines collapsed); other content is returned as-is.

        Raises:
            FileConversionException: At least one converter accepted the
                stream but raised while converting, and none succeeded.
            UnsupportedFormatException: No converter attempted a conversion.
            AssertionError: The stream position moved between attempts.
        """
        # Converters that accepted the stream but raised while converting.
        failures: List[FailedConversionAttempt] = []

        # Re-sorted on every call because priorities may change between calls;
        # ``sorted`` is stable, so equal priorities keep registration order.
        by_priority = sorted(
            self._converters, key=lambda registration: registration.priority
        )

        # Every attempt must start from the position the stream had on entry.
        start_pos = file_stream.tell()

        for stream_info in stream_info_guesses + [StreamInfo()]:
            for registration in by_priority:
                converter = registration.converter

                # Sanity check -- the position must still be the initial one.
                assert (
                    start_pos == file_stream.tell()
                ), f"File stream position should NOT change between guess iterations"

                options = dict(kwargs)

                # Fill in instance-wide options the caller did not override.
                for option in ("llm_client", "llm_model", "style_map", "exiftool_path"):
                    if option not in options:
                        instance_default = getattr(self, f"_{option}")
                        if instance_default is not None:
                            options[option] = instance_default

                # Expose the registered converters for nested processing.
                options["_parent_converters"] = self._converters

                # Legacy keyword arguments derived from the current guess.
                if stream_info is not None:
                    if stream_info.extension is not None:
                        options["file_extension"] = stream_info.extension
                    if stream_info.url is not None:
                        options["url"] = stream_info.url

                # A converter that does not implement ``accepts`` is treated
                # as declining the stream.
                try:
                    accepted = converter.accepts(file_stream, stream_info, **options)
                except NotImplementedError:
                    accepted = False

                # accepts() must leave the stream position untouched.
                assert (
                    start_pos == file_stream.tell()
                ), f"{type(converter).__name__}.accept() should NOT change the file_stream position"

                if not accepted:
                    continue

                result = None
                try:
                    result = converter.convert(file_stream, stream_info, **options)
                except Exception:
                    failures.append(
                        FailedConversionAttempt(
                            converter=converter, exc_info=sys.exc_info()
                        )
                    )
                finally:
                    # Rewind for the next attempt, whatever happened.
                    file_stream.seek(start_pos)

                if result is None:
                    continue

                # Only plain strings can be normalized; anything else (for
                # example a not-yet-awaited value) is passed through untouched.
                if isinstance(result.text_content, str):
                    result.text_content = "\n".join(
                        [line.rstrip() for line in re.split(r"\r?\n", result.text_content)]
                    )
                    result.text_content = re.sub(r"\n{3,}", "\n\n", result.text_content)
                return result

        # Nothing succeeded: surface converter errors if there were any.
        if failures:
            raise FileConversionException(attempts=failures)

        # No converter even tried.
        raise UnsupportedFormatException(
            f"Could not convert stream to Markdown. No converter attempted a conversion, suggesting that the filetype is simply not supported."
        )
|
105
g4f/integration/markitdown/_audio_converter.py
Normal file
105
g4f/integration/markitdown/_audio_converter.py
Normal file
@@ -0,0 +1,105 @@
|
||||
from typing import Any, BinaryIO
|
||||
|
||||
from markitdown.converters._exiftool import exiftool_metadata
|
||||
from markitdown._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from markitdown._stream_info import StreamInfo
|
||||
from markitdown._exceptions import MissingDependencyException
|
||||
|
||||
from ._transcribe_audio import transcribe_audio
|
||||
|
||||
ACCEPTED_MIME_TYPE_PREFIXES = [
|
||||
"audio/x-wav",
|
||||
"audio/mpeg",
|
||||
"video/mp4",
|
||||
"video/webm",
|
||||
"audio/webm",
|
||||
]
|
||||
|
||||
ACCEPTED_FILE_EXTENSIONS = [
|
||||
".wav",
|
||||
".mp3",
|
||||
".m4a",
|
||||
".mp4",
|
||||
".webm",
|
||||
]
|
||||
|
||||
class AudioConverter(DocumentConverter):
    """
    Converts audio files to markdown via metadata extraction
    (`exiftool_metadata`) and speech transcription (`transcribe_audio`).
    """

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """Return True when the extension or mimetype is a known audio type.

        Matching is case-insensitive; mimetypes are matched by prefix so
        parameters such as ``;codecs=...`` do not prevent a match.
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        return mimetype.startswith(tuple(ACCEPTED_MIME_TYPE_PREFIXES))

    @staticmethod
    def _detect_audio_format(stream_info: StreamInfo):
        """Map the stream description to a transcription format name, or None.

        Uses the same normalization as ``accepts`` (lower-cased values,
        prefix-matched mimetypes) so every accepted stream is also
        recognized here.
        """
        extension = (stream_info.extension or "").lower()
        mimetype = (stream_info.mimetype or "").lower()

        if extension == ".wav" or mimetype.startswith("audio/x-wav"):
            return "wav"
        if extension == ".mp3" or mimetype.startswith("audio/mpeg"):
            return "mp3"
        if extension in (".mp4", ".m4a") or mimetype.startswith("video/mp4"):
            return "mp4"
        if extension == ".webm" or mimetype.startswith(("audio/webm", "video/webm")):
            return "webm"
        return None

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        language: str = "en-US",
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """Convert an audio stream to markdown text.

        Args:
            file_stream: The audio data.
            stream_info: Extension/mimetype used to pick the audio format.
            language: Recognition language passed to ``transcribe_audio``.
                A falsy value (e.g. ``None``) falls back to ``"en-US"``.
            **kwargs: Converter options; ``exiftool_path`` is read from here.

        Returns:
            A ``DocumentConverterResult`` holding the transcript when
            transcription ran, otherwise the collected metadata lines.
        """
        # Callers may forward an explicit None (for instance an absent request
        # header); without this it would override the default and reach the
        # recognizer verbatim.
        if not language:
            language = "en-US"

        md_content = ""

        # Add metadata
        metadata = exiftool_metadata(
            file_stream, exiftool_path=kwargs.get("exiftool_path")
        )
        if metadata:
            for field in (
                "Title",
                "Artist",
                "Author",
                "Band",
                "Album",
                "Genre",
                "Track",
                "DateTimeOriginal",
                "CreateDate",
                # "Duration", -- Wrong values when read from memory
                "NumChannels",
                "SampleRate",
                "AvgBytesPerSec",
                "BitsPerSample",
            ):
                if field in metadata:
                    md_content += f"{field}: {metadata[field]}\n"

        # Figure out the audio format for transcription
        audio_format = self._detect_audio_format(stream_info)

        # Transcribe
        if audio_format:
            try:
                # NOTE(review): the transcript replaces the metadata lines
                # gathered above rather than being appended to them --
                # confirm this is intended.
                md_content = transcribe_audio(
                    file_stream, audio_format=audio_format, language=language
                )
            except MissingDependencyException:
                # Best effort: without the optional speech dependencies the
                # result is whatever metadata was collected.
                pass

        # Return the result
        return DocumentConverterResult(markdown=md_content.strip())
|
10
g4f/integration/markitdown/_base_converter.py
Normal file
10
g4f/integration/markitdown/_base_converter.py
Normal file
@@ -0,0 +1,10 @@
|
||||
from typing import Awaitable
|
||||
|
||||
class AsyncDocumentConverterResult:
    """Conversion result whose Markdown text is delivered through an awaitable.

    The object only carries the value passed in as ``text_content``; nothing
    in this class awaits it.
    """

    def __init__(self, text_content: Awaitable[str]):
        # Kept exactly as received so the consumer decides when to await it.
        self.text_content = text_content
|
92
g4f/integration/markitdown/_image_converter.py
Normal file
92
g4f/integration/markitdown/_image_converter.py
Normal file
@@ -0,0 +1,92 @@
|
||||
from typing import BinaryIO, Any
|
||||
import asyncio
|
||||
from markitdown._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from markitdown._stream_info import StreamInfo
|
||||
from markitdown.converters._llm_caption import llm_caption
|
||||
from markitdown.converters._exiftool import exiftool_metadata
|
||||
|
||||
from ._base_converter import AsyncDocumentConverterResult
|
||||
|
||||
ACCEPTED_MIME_TYPE_PREFIXES = [
|
||||
"image/jpeg",
|
||||
"image/png",
|
||||
]
|
||||
|
||||
ACCEPTED_FILE_EXTENSIONS = [".jpg", ".jpeg", ".png"]
|
||||
|
||||
|
||||
class ImageConverter(DocumentConverter):
    """
    Turns images into markdown: selected metadata fields from
    `exiftool_metadata`, plus an LLM-written description when both an
    `llm_client` and an `llm_model` are supplied.
    """

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> bool:
        """Return True for a known image extension or mimetype prefix (case-insensitive)."""
        extension = (stream_info.extension or "").lower()
        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        mimetype = (stream_info.mimetype or "").lower()
        return any(mimetype.startswith(prefix) for prefix in ACCEPTED_MIME_TYPE_PREFIXES)

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """Build markdown for an image.

        Reads ``exiftool_path``, ``llm_client``, ``llm_model`` and
        ``llm_prompt`` from ``kwargs``. When the caption call hands back a
        coroutine, an ``AsyncDocumentConverterResult`` wrapping it is returned
        instead (the metadata text is not included in that case).
        """
        reported_fields = (
            "ImageSize",
            "Title",
            "Caption",
            "Description",
            "Keywords",
            "Artist",
            "Author",
            "DateTimeOriginal",
            "CreateDate",
            "GPSPosition",
        )

        # Metadata section: one "Name: value" line per field that is present.
        md_content = ""
        metadata = exiftool_metadata(
            file_stream, exiftool_path=kwargs.get("exiftool_path")
        )
        if metadata:
            md_content += "".join(
                f"{name}: {metadata[name]}\n"
                for name in reported_fields
                if name in metadata
            )

        # The description needs both a client and a model.
        llm_client = kwargs.get("llm_client")
        llm_model = kwargs.get("llm_model")
        if llm_client is None or llm_model is None:
            return DocumentConverterResult(
                markdown=md_content,
            )

        # NOTE(review): the coroutine branch below can only trigger if the
        # `llm_caption` this module imports is able to return a coroutine --
        # confirm the import points at an async-aware implementation.
        llm_description = llm_caption(
            file_stream,
            stream_info,
            client=llm_client,
            model=llm_model,
            prompt=kwargs.get("llm_prompt"),
        )

        if asyncio.iscoroutine(llm_description):
            return AsyncDocumentConverterResult(
                llm_description,
            )

        if llm_description is not None:
            md_content += "\n# Description:\n" + llm_description.strip() + "\n"

        return DocumentConverterResult(
            markdown=md_content,
        )
|
56
g4f/integration/markitdown/_llm_caption.py
Normal file
56
g4f/integration/markitdown/_llm_caption.py
Normal file
@@ -0,0 +1,56 @@
|
||||
from typing import BinaryIO, Union, Awaitable
|
||||
import base64
|
||||
import mimetypes
|
||||
import asyncio
|
||||
from markitdown._stream_info import StreamInfo
|
||||
|
||||
|
||||
def llm_caption(
    file_stream: BinaryIO, stream_info: StreamInfo, *, client, model, prompt=None
) -> Union[None, str, Awaitable[str]]:
    """Ask a chat-completions client to caption the image in ``file_stream``.

    Args:
        file_stream: Image data; read fully, then rewound to where it started.
        stream_info: Supplies ``mimetype`` and, as a fallback for guessing the
            content type, ``extension``.
        client: Object exposing ``chat.completions.create(model=..., messages=...)``.
        model: Model identifier forwarded to the client.
        prompt: Caption instruction; a default is used when missing or blank.

    Returns:
        ``None`` if the stream could not be read; the caption text for a
        synchronous client; or, when the client returns an awaitable, a
        coroutine that resolves to the caption text.
    """
    # Function-scope import keeps this block self-contained.
    import inspect

    if prompt is None or prompt.strip() == "":
        prompt = "Write a detailed caption for this image."

    # Get the content type: declared mimetype, else guessed from the
    # extension, else a generic binary type.
    content_type = stream_info.mimetype
    if not content_type:
        content_type, _ = mimetypes.guess_type("_dummy" + (stream_info.extension or ""))
    if not content_type:
        content_type = "application/octet-stream"

    # Convert to base64, always restoring the caller's stream position.
    cur_pos = file_stream.tell()
    try:
        base64_image = base64.b64encode(file_stream.read()).decode("utf-8")
    except Exception:
        # Best effort: an unreadable stream simply yields no caption.
        return None
    finally:
        file_stream.seek(cur_pos)

    # Prepare the data-uri
    data_uri = f"data:{content_type};base64,{base64_image}"

    # Prepare the OpenAI API request
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": data_uri,
                    },
                },
            ],
        }
    ]

    # Call the OpenAI API
    response = client.chat.completions.create(model=model, messages=messages)

    # Any awaitable (not only coroutine objects) is deferred; the caller still
    # receives a coroutine, so ``asyncio.iscoroutine`` checks keep working.
    if inspect.isawaitable(response):
        async def read_content(pending):
            completed = await pending
            return completed.choices[0].message.content

        return read_content(response)

    return response.choices[0].message.content
|
49
g4f/integration/markitdown/_transcribe_audio.py
Normal file
49
g4f/integration/markitdown/_transcribe_audio.py
Normal file
@@ -0,0 +1,49 @@
|
||||
import io
|
||||
import sys
|
||||
from typing import BinaryIO
|
||||
from markitdown._exceptions import MissingDependencyException
|
||||
|
||||
# Try loading optional (but in this case, required) dependencies
|
||||
# Save reporting of any exceptions for later
|
||||
_dependency_exc_info = None
|
||||
try:
|
||||
# Suppress some warnings on library import
|
||||
import warnings
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
||||
warnings.filterwarnings("ignore", category=SyntaxWarning)
|
||||
import speech_recognition as sr
|
||||
import pydub
|
||||
except ImportError:
|
||||
# Preserve the error and stack trace for later
|
||||
_dependency_exc_info = sys.exc_info()
|
||||
|
||||
|
||||
def transcribe_audio(file_stream: BinaryIO, *, audio_format: str = "wav", language: str = "en-US") -> str:
    """Transcribe speech from an audio stream.

    Args:
        file_stream: The audio data.
        audio_format: One of "wav", "aiff", "flac" (used directly) or
            "mp3", "mp4", "webm" (converted to WAV in memory first).
        language: Language tag handed to the recognizer.

    Returns:
        The stripped transcript, or "[No speech detected]" when it is empty.

    Raises:
        MissingDependencyException: The optional speech dependencies failed
            to import when this module was loaded.
        ValueError: ``audio_format`` is not one of the supported names.
    """
    # Report the import failure recorded at module load, chained to the
    # original error and its traceback.
    if _dependency_exc_info is not None:
        raise MissingDependencyException(
            "Speech transcription requires installing MarkItdown with the [audio-transcription] optional dependencies. E.g., `pip install markitdown[audio-transcription]` or `pip install markitdown[all]`"
        ) from _dependency_exc_info[1].with_traceback(  # type: ignore[union-attr]
            _dependency_exc_info[2]
        )

    if audio_format in ("mp3", "mp4", "webm"):
        # Not usable directly: decode, then re-encode as WAV in memory.
        decoded = pydub.AudioSegment.from_file(file_stream, format=audio_format)
        wav_buffer = io.BytesIO()
        decoded.export(wav_buffer, format="wav")
        wav_buffer.seek(0)
        audio_source = wav_buffer
    elif audio_format in ("wav", "aiff", "flac"):
        audio_source = file_stream
    else:
        raise ValueError(f"Unsupported audio format: {audio_format}")

    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_source) as source:
        recorded = recognizer.record(source)
        text = recognizer.recognize_google(recorded, language=language).strip()
        # An empty string is the only falsy str, so ``or`` supplies the placeholder.
        return text or "[No speech detected]"
|
@@ -114,10 +114,7 @@ def format_images_markdown(images: Union[str, List[str]], alt: str,
|
||||
)
|
||||
for idx, image in enumerate(images)
|
||||
)
|
||||
|
||||
start_flag = "<!-- generated images start -->\n"
|
||||
end_flag = "<!-- generated images end -->\n"
|
||||
return f"\n{start_flag}{result}\n{end_flag}\n"
|
||||
return result
|
||||
|
||||
class ResponseType:
|
||||
@abstractmethod
|
||||
|
@@ -586,7 +586,7 @@ async def get_async_streaming(bucket_dir: str, delete_files = False, refine_chun
|
||||
raise e
|
||||
|
||||
def get_tempfile(file, suffix):
|
||||
copyfile = tempfile.NamedTemporaryFile(suffix=os.path.splitext(suffix)[-1], delete=False)
|
||||
copyfile = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
|
||||
shutil.copyfileobj(file, copyfile)
|
||||
copyfile.close()
|
||||
file.close()
|
||||
|
@@ -24,6 +24,12 @@ def render_media(bucket_id: str, name: str, url: str, as_path: bool = False, as_
|
||||
def render_part(part: dict) -> dict:
|
||||
if "type" in part:
|
||||
return part
|
||||
text = part.get("text")
|
||||
if text:
|
||||
return {
|
||||
"type": "text",
|
||||
"text": text
|
||||
}
|
||||
filename = part.get("name")
|
||||
if (filename is None):
|
||||
bucket_dir = Path(get_bucket_dir(part.get("bucket_id")))
|
||||
|
Reference in New Issue
Block a user