444 lines
No EOL
20 KiB
PHP
444 lines
No EOL
20 KiB
PHP
<?php
|
|
include 'config.php';
|
|
include 'logger.php';
|
|
include 'azure_storage.php';
|
|
|
|
// ---------------------------------------------------------------------------
// download.php request handler
//
// Expects four GET parameters: document_id, document_key (a JSON object that
// must contain "target_blob" and may contain "source_blob"), original_filename
// and target_lang. The script
//   1. logs the translation batch status (informational only),
//   2. lists the "translated-documents" container and tries to download the
//      best-matching blob directly,
//   3. falls back to AzureStorageHelper::downloadTranslatedFile() for the
//      target blob and then for the source blob,
//   4. reports the result to the webhook and streams the file to the client.
// Failures are reported as a JSON object of the form {"error": "..."}.
//
// NOTE(review): nothing visible in this file authenticates the caller, and the
// blob names come straight from the request. Confirm access control is
// enforced elsewhere.
// ---------------------------------------------------------------------------

logMessage("Received request to download.php");

// Emits a JSON error payload and terminates the request.
$respondWithError = static function ($message) {
    if (!headers_sent()) {
        header('Content-Type: application/json');
    }
    echo json_encode(['error' => $message]);
    exit;
};

// Builds the attachment filename and neutralises characters that could break
// out of the quoted filename="..." parameter (control characters, double
// quotes, slashes and backslashes).
$buildDownloadName = static function ($stem, $ext) {
    $name = ($ext === '') ? $stem : $stem . '.' . $ext;
    $name = preg_replace('/[\x00-\x1F\x7F"\\\\\/]/', '_', $name);
    return ($name === null || $name === '') ? 'download' : $name;
};

// Extension => Content-Type table shared by every lookup below.
// The legacy doc/ppt/xls entries deliberately keep the OOXML types the
// original code used. NOTE(review): confirm that is what clients expect.
$mimeTypesByExtension = [
    'pdf'  => 'application/pdf',
    'docx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    'doc'  => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    'pptx' => 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
    'ppt'  => 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
    'xlsx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
    'xls'  => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
    'txt'  => 'text/plain',
    'html' => 'text/html',
    'htm'  => 'text/html',
];

// Double quotes are required here: inside single quotes PHP does not
// interpret \x03 / \x04, so the comparison could never match a real ZIP header.
$zipSignature = "PK\x03\x04";

// Every parameter must be present, a plain string (not ?param[]=...) and
// non-empty. An empty target_lang would otherwise match every blob below.
foreach (['document_id', 'document_key', 'original_filename', 'target_lang'] as $requiredParam) {
    if (!isset($_GET[$requiredParam]) || !is_string($_GET[$requiredParam]) || $_GET[$requiredParam] === '') {
        logMessage("Missing required parameters", 'ERROR');
        $respondWithError('Missing required parameters (document_id, document_key, original_filename, or target_lang)');
    }
}

$document_id = $_GET['document_id'];
$document_key = $_GET['document_key'];
$original_filename = $_GET['original_filename'];
$target_lang = $_GET['target_lang'];

logMessage("Attempting to download Document ID: $document_id, Original filename: $original_filename, Target language: $target_lang");

// Decode the document key to get blob information
$documentKeyData = json_decode($document_key, true);
if (json_last_error() !== JSON_ERROR_NONE) {
    logMessage("Invalid document_key format", 'ERROR');
    $respondWithError('Invalid document_key format');
}

// Make sure we have the target blob name (and that it is usable as a string)
if (!is_array($documentKeyData)
    || !isset($documentKeyData['target_blob'])
    || !is_string($documentKeyData['target_blob'])) {
    logMessage("Missing target_blob in document_key", 'ERROR');
    $respondWithError('Missing target blob information');
}

$targetBlobName = $documentKeyData['target_blob'];
$sourceBlobName = isset($documentKeyData['source_blob']) ? $documentKeyData['source_blob'] : '';

// Set timeout for downloads
set_time_limit(60);

// Initialize Azure Storage Helper
$azureStorage = new AzureStorageHelper();

// ---------------------------------------------------------------------------
// Step 1: check the translation status. The result is only logged; the
// download is attempted regardless.
// ---------------------------------------------------------------------------
$statusUrl = MS_API_ENDPOINT . '/translator/document/batches/' . rawurlencode($document_id)
    . '?api-version=' . MS_API_VERSION;

$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $statusUrl);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
curl_setopt($ch, CURLOPT_TIMEOUT, 30);
curl_setopt($ch, CURLOPT_HTTPHEADER, [
    'Ocp-Apim-Subscription-Key: ' . MS_API_KEY,
    'Ocp-Apim-Subscription-Region: ' . MS_API_REGION,
]);
$response = curl_exec($ch);
$httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
curl_close($ch);

if ($httpCode !== 200 || $response === false) {
    logMessage("Failed to check translation status. HTTP Code: $httpCode", 'ERROR');
} else {
    $statusData = json_decode($response, true);
    $translationStatus = (is_array($statusData) && isset($statusData['status']) && is_string($statusData['status']))
        ? $statusData['status']
        : null;

    if ($translationStatus === null) {
        // Do not claim success when the response carries no status field.
        logMessage("Translation status missing from status response. Attempting download anyway.", 'WARNING');
    } elseif (strtolower($translationStatus) !== 'succeeded') {
        logMessage("Translation status is not Succeeded: " . $translationStatus . ". Attempting download anyway.", 'WARNING');
    } else {
        logMessage("Translation has Succeeded status. Proceeding with download.");
    }
}

// ---------------------------------------------------------------------------
// Step 2: DIRECT APPROACH - list the translated container and match a blob.
// ---------------------------------------------------------------------------
logMessage("USING DIRECT AZURE LISTING to find files for document ID: $document_id");

$fileExtension = '';
if (preg_match('/\.([^\.]+)$/', $original_filename, $extMatches)) {
    $fileExtension = strtolower($extMatches[1]);
}
logMessage("Original filename: $original_filename, Extension: $fileExtension, Target Language: $target_lang");
logMessage("Source blob: $sourceBlobName, Target blob: $targetBlobName");

// Extract the basic filename without our added prefix
// (everything after the first "-<UPPERCASE>_" marker in the target blob name).
$baseFilename = '';
if (preg_match('/^.*?\-([A-Z]+)_(.+)$/', $targetBlobName, $matches)) {
    $baseFilename = $matches[2];
    logMessage("Base filename extracted from target blob: $baseFilename");
}

$sasToken = defined('AZURE_STORAGE_SAS_TOKEN') ? AZURE_STORAGE_SAS_TOKEN : '';
$accountName = 'opticaltranslations'; // Hardcoding for simplicity
$containerBaseUrl = "https://$accountName.blob.core.windows.net/translated-documents";
// NOTE(review): the listing is requested without a prefix and without
// following continuation markers, so a large container may be listed only
// partially - confirm against the List Blobs API limits.
$containerUrl = "$containerBaseUrl?restype=container&comp=list&$sasToken";

logMessage("Listing all files in translated container");

// TLS peer verification is left at cURL's default (enabled): these requests
// carry the SAS token and must not be sent to an unverified host.
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $containerUrl);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
curl_setopt($ch, CURLOPT_TIMEOUT, 30);
$listResponse = curl_exec($ch);
$httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
curl_close($ch);

$fileContent = false;
$directDownloadUrl = null;

if ($listResponse !== false && $httpCode < 400) {
    logMessage("Successfully listed container contents");

    // Parse XML quietly so libxml warnings cannot leak into the response body.
    $previousLibxmlSetting = libxml_use_internal_errors(true);
    $xml = simplexml_load_string($listResponse);
    libxml_clear_errors();
    libxml_use_internal_errors($previousLibxmlSetting);

    if ($xml && isset($xml->Blobs->Blob)) {
        $potentialMatches = [];
        $exactMatches = [];
        $langLower = strtolower($target_lang);
        $langUpper = strtoupper($target_lang);

        // NOTE(review): the language / filename / extension matches are very
        // loose (any non-empty blob sharing the extension qualifies). When no
        // exact match exists this can select a blob belonging to a different
        // request. Behaviour kept as-is; consider restricting to exact matches.
        foreach ($xml->Blobs->Blob as $blob) {
            $name = (string)$blob->Name;
            $lastModified = (string)$blob->Properties->{'Last-Modified'};
            $timestamp = strtotime($lastModified);
            $size = (int)$blob->Properties->{'Content-Length'};

            logMessage("Found blob: $name (size: $size bytes, modified: $lastModified)");

            // Skip empty blobs up front so the size rule applies to the exact
            // match too; a zero-byte target must not count as a successful download.
            if ($size <= 0) {
                continue;
            }

            // Exact match - this is the blob named in the document key
            if ($name === $targetBlobName) {
                logMessage("EXACT MATCH FOUND: $name");
                $exactMatches[$name] = $timestamp;
            }

            // Matching language code (all-lowercase or all-uppercase form)
            if (strpos($name, $langLower) !== false || strpos($name, $langUpper) !== false) {
                logMessage("Language match found: $name");
                $potentialMatches[$name] = $timestamp;
            }

            // Matching original filename (ignoring language prefix)
            if (!empty($baseFilename) && strpos($name, $baseFilename) !== false) {
                logMessage("Filename match found: $name");
                $potentialMatches[$name] = $timestamp;
            }

            // Matching file extension
            if (!empty($fileExtension) && strpos($name, ".$fileExtension") !== false) {
                logMessage("Extension match found: $name");
                $potentialMatches[$name] = $timestamp;
            }
        }

        // Prefer an exact match; otherwise take the most recently modified candidate.
        $matchedFile = null;
        if (!empty($exactMatches)) {
            arsort($exactMatches);
            reset($exactMatches);
            $matchedFile = key($exactMatches);
            logMessage("Using exact match file: $matchedFile");
        } elseif (!empty($potentialMatches)) {
            arsort($potentialMatches);
            reset($potentialMatches);
            $matchedFile = key($potentialMatches);
            logMessage("Using potential match file: $matchedFile");
        } else {
            logMessage("No matching files found", 'WARNING');
        }

        // If we found a matching file, try to download it
        if ($matchedFile) {
            $blobUrl = $containerBaseUrl . '/' . rawurlencode($matchedFile);
            $directDownloadUrl = $blobUrl . '?' . $sasToken;
            // Log the URL without its query string so the SAS token never reaches the log.
            logMessage("Direct download URL: $blobUrl");

            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $directDownloadUrl);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
            curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
            curl_setopt($ch, CURLOPT_TIMEOUT, 60);
            $downloadedBody = curl_exec($ch);
            $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
            $finalContentType = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
            curl_close($ch);

            $downloadedSize = is_string($downloadedBody) ? strlen($downloadedBody) : 0;
            logMessage("Download result: HTTP $httpCode, Content-Type: $finalContentType, Size: $downloadedSize bytes");

            // Accept only a 2xx body. With RETURNTRANSFER cURL also returns the
            // body of a 403/404 response, which must not be served as the file.
            if ($downloadedBody !== false && $httpCode >= 200 && $httpCode < 300) {
                $fileContent = $downloadedBody;
                // Kept in $GLOBALS because the content-type logic below reads them
                // from there. NOTE(review): AzureStorageHelper may set these keys
                // too - confirm before changing them to local variables.
                $GLOBALS['detected_content_type'] = $finalContentType;
                $GLOBALS['matched_filename'] = $matchedFile;
            } else {
                logMessage("Direct download of $matchedFile failed (HTTP $httpCode)", 'WARNING');
            }
        }
    }
} else {
    logMessage("Failed to list container contents. HTTP: $httpCode", 'ERROR');
}

// ---------------------------------------------------------------------------
// Step 3: fallbacks.
// ---------------------------------------------------------------------------
if ($fileContent !== false) {
    logMessage("Successfully downloaded file. Size: " . strlen($fileContent) . " bytes");
} else {
    logMessage("Failed to download file using direct Azure listing. Falling back to normal method.", 'WARNING');

    // Make document ID available globally for the fallback method
    $GLOBALS['document_id'] = $document_id;
    $GLOBALS['original_filename'] = $original_filename;

    // Use the original download method as fallback
    $fileContent = $azureStorage->downloadTranslatedFile($targetBlobName);
}

if ($fileContent === false) {
    logMessage("Failed to download translated file from Azure Storage", 'ERROR');

    // Try to download from source as fallback
    if (!isset($documentKeyData['source_blob'])) {
        $respondWithError('Failed to download translated file');
    }

    logMessage("Attempting to download source file as fallback", 'WARNING');
    $sourceBlobName = $documentKeyData['source_blob'];
    $sourceContent = $azureStorage->downloadTranslatedFile($sourceBlobName);

    if ($sourceContent === false) {
        $respondWithError('Failed to download translated file and source fallback');
    }

    logMessage("Downloaded source file as fallback");
    // NOTE(review): prepending text corrupts binary formats (PDF/Office).
    // Behaviour kept as in the original - confirm it is intended.
    $fileContent = "[TRANSLATION FAILED - SHOWING ORIGINAL CONTENT]\n\n" . $sourceContent;
}

// ---------------------------------------------------------------------------
// Step 4: work out filename and content type, notify the webhook, send the file.
// Every path that reaches this point has non-false $fileContent.
// ---------------------------------------------------------------------------

// pathinfo() with an explicit flag returns '' when the part is missing, so a
// filename without an extension no longer triggers an undefined-index error.
$originalStem = pathinfo($original_filename, PATHINFO_FILENAME);
$downloadExtension = pathinfo($original_filename, PATHINFO_EXTENSION);
$extension = strtolower($downloadExtension);

// Language-code prefix + original name. The prefix is kept even when the
// extension is corrected below (the original code dropped it in that case).
$downloadStem = strtoupper($target_lang) . '_' . $originalStem;

$contentType = 'application/octet-stream'; // Default content type

if (isset($GLOBALS['matched_filename']) && !empty($GLOBALS['matched_filename'])) {
    // A blob was matched in storage: trust its extension.
    $matchedFilename = $GLOBALS['matched_filename'];
    logMessage("Using matched filename for content type detection: $matchedFilename");

    if (preg_match('/\.([^\.]+)$/', $matchedFilename, $matchedExtMatches)) {
        $matchedExtension = strtolower($matchedExtMatches[1]);
        logMessage("Matched file extension: $matchedExtension");

        if ($matchedExtension !== $extension) {
            logMessage("Matched file extension ($matchedExtension) differs from requested ($extension). Adjusting filename.", 'WARNING');
            $downloadExtension = $matchedExtension;
        }

        if (isset($mimeTypesByExtension[$matchedExtension])) {
            $contentType = $mimeTypesByExtension[$matchedExtension];
        }
    }
} elseif (isset($GLOBALS['detected_content_type']) && !empty($GLOBALS['detected_content_type'])) {
    // No matched blob name, but a Content-Type was recorded for the download.
    $contentType = $GLOBALS['detected_content_type'];
    logMessage("Using detected Content-Type from server: $contentType");

    // If the content is PDF but a different file type was requested,
    // adjust the filename extension to match the content.
    if ($contentType === 'application/pdf' && $extension !== 'pdf') {
        logMessage("Content is PDF but requested extension is $extension. Adjusting filename extension to .pdf", 'WARNING');
        $downloadExtension = 'pdf';
    }
} else {
    // Fallback to the original extension
    logMessage("Fallback to original file extension: $extension");
    if (isset($mimeTypesByExtension[$extension])) {
        $contentType = $mimeTypesByExtension[$extension];
    }
}

// Basic content type detection from file content when nothing above decided it
if ($contentType === 'application/octet-stream') {
    $leadingBytes = substr($fileContent, 0, 4);

    if ($leadingBytes === '%PDF') {
        logMessage("PDF signature detected in content, setting content type to application/pdf", 'WARNING');
        $contentType = 'application/pdf';
        if ($extension !== 'pdf') {
            $downloadExtension = 'pdf';
        }
    } elseif ($leadingBytes === $zipSignature) {
        // Office files are zip files
        if (strpos($original_filename, '.pptx') !== false || strpos($original_filename, '.ppt') !== false) {
            $contentType = 'application/vnd.openxmlformats-officedocument.presentationml.presentation';
        } elseif (strpos($original_filename, '.docx') !== false || strpos($original_filename, '.doc') !== false) {
            $contentType = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document';
        } elseif (strpos($original_filename, '.xlsx') !== false || strpos($original_filename, '.xls') !== false) {
            $contentType = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet';
        }
    }
}

$newFilename = $buildDownloadName($downloadStem, $downloadExtension);

// Notify the webhook only now, so it receives the final filename.
$webhookData = [
    'tool' => 'DOCUMENT_TRANSLATION',
    'date' => date('Y-m-d H:i:s'),
    'user' => 'user@example.com', // Replace with actual user information if available
    'model' => 'Microsoft Translator',
    'settings' => 'Document Translation',
    'subTool' => 'Microsoft Translator API',
    'originalFilename' => $original_filename,
    'translatedFilename' => $newFilename,
    'targetLanguage' => $target_lang,
    'documentId' => $document_id,
    'file' => base64_encode($fileContent),
];
sendDataToWebhook($webhookData);

// Diagnostics for binary formats (log-only; the download proceeds regardless)
$isBinary = in_array($extension, ['pdf', 'pptx', 'ppt', 'docx', 'doc', 'xlsx', 'xls'], true);
if ($isBinary) {
    $leadingBytes = substr($fileContent, 0, 4);

    if ($extension === 'pdf' && $leadingBytes !== '%PDF') {
        logMessage("Warning: PDF file doesn't have proper signature", 'WARNING');
    } elseif (in_array($extension, ['pptx', 'docx', 'xlsx'], true) && $leadingBytes !== $zipSignature) {
        logMessage("Warning: Office file doesn't have proper ZIP signature", 'WARNING');
    }

    // Ensure we're not sending text content for binary files
    if (preg_match('/^\[EMERGENCY FALLBACK/', $fileContent)) {
        logMessage("Error: Emergency fallback content detected for binary file", 'ERROR');
        // Still proceed with download, but notify in logs
    }

    logMessage("Binary file download: $contentType, size: " . strlen($fileContent) . " bytes");
}

// Send the file to the client
header("Content-Type: $contentType");
header("Content-Disposition: attachment; filename=\"$newFilename\"");
header("Content-Length: " . strlen($fileContent));
header("Cache-Control: no-cache, must-revalidate");
header("Pragma: no-cache");

echo $fileContent;
logMessage("File download initiated: $newFilename");
|
|
|
|
/**
 * POSTs $data as a JSON document to the URL held in the DATASTORE_WEBHOOK
 * constant and logs the outcome through logMessage().
 *
 * Delivery is best-effort: every failure (constant missing or empty, payload
 * not encodable, cURL error, non-2xx response) is logged and the function
 * returns normally, so the caller is never interrupted.
 *
 * @param mixed $data Value to serialise with json_encode(); the caller in this
 *                    file passes an associative array.
 * @return void
 */
function sendDataToWebhook($data)
{
    $url = defined('DATASTORE_WEBHOOK') ? DATASTORE_WEBHOOK : '';
    if (empty($url)) {
        logMessage('Webhook URL not defined. Skipping webhook notification.', 'WARNING');
        return;
    }

    $payload = json_encode($data);
    if ($payload === false) {
        // Without this guard a boolean false would be handed to CURLOPT_POSTFIELDS.
        logMessage('Error sending data to webhook: could not encode payload (' . json_last_error_msg() . ')', 'ERROR');
        return;
    }

    $ch = curl_init($url);
    if ($ch === false) {
        logMessage('Error sending data to webhook: could not initialise cURL', 'ERROR');
        return;
    }

    // RETURNTRANSFER keeps the webhook's response body out of our own output.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_POST, true);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $payload);
    curl_setopt($ch, CURLOPT_HTTPHEADER, ['Content-Type: application/json']);
    // Bound the call so an unresponsive webhook cannot stall the caller indefinitely.
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, 60);

    $result = curl_exec($ch);
    $statusCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $error = curl_error($ch);
    curl_close($ch);

    // Any 2xx status means the webhook accepted the data (201/202/204 included).
    if ($result !== false && $statusCode >= 200 && $statusCode < 300) {
        logMessage('Data sent to webhook successfully');
        return;
    }

    // curl_error() is empty when the failure is an HTTP status, so report that instead.
    $reason = ($error !== '') ? $error : 'HTTP status ' . $statusCode;
    logMessage('Error sending data to webhook: ' . $reason, 'ERROR');
}
|
|
?>
|