msft-trns/download-old.php
2026-03-02 17:21:57 +00:00

444 lines
No EOL
20 KiB
PHP

<?php
include 'config.php';
include 'logger.php';
include 'azure_storage.php';
logMessage("Received request to download.php");

/*
 * Download endpoint for a translated document.
 *
 * Expected query parameters (all required, all plain strings):
 *   - document_id        Translator batch id, used only for the status check.
 *   - document_key       JSON object; must contain "target_blob", may contain
 *                        "source_blob".
 *   - original_filename  Name of the file the user uploaded.
 *   - target_lang        Target language code.
 *
 * Flow: check the batch status (informational only) -> list the translated
 * container and pick a blob -> download it -> fall back to
 * AzureStorageHelper::downloadTranslatedFile() -> fall back to the source
 * blob -> notify the webhook -> stream the file to the client.
 *
 * On failure a JSON object of the form {"error": "..."} is echoed instead.
 */

// PHP turns `?document_id[]=x` into an array; such values would otherwise end
// up in URLs, log lines and headers as the text "Array", so they are treated
// the same as a missing parameter.
$requiredParams = ['document_id', 'document_key', 'original_filename', 'target_lang'];
$hasAllParams = true;
foreach ($requiredParams as $paramName) {
    if (!isset($_GET[$paramName]) || !is_string($_GET[$paramName])) {
        $hasAllParams = false;
        break;
    }
}

if ($hasAllParams) {
    $document_id = $_GET['document_id'];
    $document_key = $_GET['document_key'];
    $original_filename = $_GET['original_filename'];
    $target_lang = $_GET['target_lang'];
    logMessage("Attempting to download Document ID: $document_id, Original filename: $original_filename, Target language: $target_lang");

    // Decode the document key to get blob information. Valid JSON that is not
    // an object/array (e.g. `5` or `"abc"`) is rejected as well.
    $documentKeyData = json_decode($document_key, true);
    if (json_last_error() !== JSON_ERROR_NONE || !is_array($documentKeyData)) {
        logMessage("Invalid document_key format", 'ERROR');
        echo json_encode(['error' => 'Invalid document_key format']);
        exit;
    }

    // Make sure we have the target blob name, and that it is a string.
    if (!isset($documentKeyData['target_blob']) || !is_string($documentKeyData['target_blob'])) {
        logMessage("Missing target_blob in document_key", 'ERROR');
        echo json_encode(['error' => 'Missing target blob information']);
        exit;
    }
    $targetBlobName = $documentKeyData['target_blob'];
    $hasSourceBlob = isset($documentKeyData['source_blob']) && is_string($documentKeyData['source_blob']);
    $sourceBlobName = $hasSourceBlob ? $documentKeyData['source_blob'] : '';

    // Set timeout for downloads
    set_time_limit(60);

    // Holds the bytes that will be sent to the client; false means "not yet".
    $fileContent = false;

    // Initialize Azure Storage Helper
    $azureStorage = new AzureStorageHelper();

    // ---------------------------------------------------------------------
    // 1. Translation status check (informational: never blocks the download)
    // ---------------------------------------------------------------------
    // The id comes from the query string, so it is encoded before being
    // placed in the URL path.
    $statusUrl = MS_API_ENDPOINT . '/translator/document/batches/' . rawurlencode($document_id)
        . '?api-version=' . MS_API_VERSION;
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $statusUrl);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    // Without explicit timeouts a stalled connection would hang the request:
    // time spent waiting on the network is not reliably counted by
    // set_time_limit().
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    curl_setopt($ch, CURLOPT_HTTPHEADER, [
        'Ocp-Apim-Subscription-Key: ' . MS_API_KEY,
        'Ocp-Apim-Subscription-Region: ' . MS_API_REGION
    ]);
    $response = curl_exec($ch);
    $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);

    if ($response === false || $httpCode !== 200) {
        logMessage("Failed to check translation status. HTTP Code: $httpCode", 'ERROR');
    } else {
        $statusData = json_decode($response, true);
        if (!is_array($statusData) || !isset($statusData['status']) || !is_string($statusData['status'])) {
            // Previously this case was logged as "Succeeded", which was misleading.
            logMessage("Translation status missing from status response. Attempting download anyway.", 'WARNING');
        } elseif (strtolower($statusData['status']) !== 'succeeded') {
            logMessage("Translation status is not Succeeded: " . $statusData['status'] . ". Attempting download anyway.", 'WARNING');
        } else {
            logMessage("Translation has Succeeded status. Proceeding with download.");
        }
    }

    // ---------------------------------------------------------------------
    // 2. Direct approach: list the translated container and pick a blob
    // ---------------------------------------------------------------------
    logMessage("USING DIRECT AZURE LISTING to find files for document ID: $document_id");

    $fileExtension = '';
    if (preg_match('/\.([^\.]+)$/', $original_filename, $extMatches)) {
        $fileExtension = strtolower($extMatches[1]);
    }
    logMessage("Original filename: $original_filename, Extension: $fileExtension, Target Language: $target_lang");
    logMessage("Source blob: $sourceBlobName, Target blob: $targetBlobName");

    // Extract the basic filename without our added prefix: everything after
    // the first "-<UPPERCASE>_" in the target blob name.
    $baseFilename = '';
    if (preg_match('/^.*?\-([A-Z]+)_(.+)$/', $targetBlobName, $matches)) {
        $baseFilename = $matches[2];
        logMessage("Base filename extracted from target blob: $baseFilename");
    }

    // A leading "?" on the configured token would corrupt the query string
    // built below, so it is stripped.
    $sasToken = defined('AZURE_STORAGE_SAS_TOKEN') ? ltrim((string) AZURE_STORAGE_SAS_TOKEN, '?') : '';
    $accountName = 'opticaltranslations'; // Hardcoding for simplicity
    $containerUrl = "https://$accountName.blob.core.windows.net/translated-documents?restype=container&comp=list&$sasToken";

    logMessage("Listing all files in translated container");
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $containerUrl);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    // TLS peer verification is intentionally left at cURL's secure default:
    // this URL carries the SAS token, so it must not be sent to an
    // unverified peer. If verification fails (e.g. no CA bundle), the listing
    // fails and the helper-based fallback below is used instead.
    $listResponse = curl_exec($ch);
    $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);

    $directDownloadUrl = null;
    $matchedFile = null;

    if ($listResponse !== false && $httpCode >= 200 && $httpCode < 300) {
        logMessage("Successfully listed container contents");

        // Collect libxml errors internally so a malformed listing cannot emit
        // warnings (output) before the download headers are sent.
        $previousLibxmlSetting = libxml_use_internal_errors(true);
        $xml = simplexml_load_string($listResponse);
        libxml_clear_errors();
        libxml_use_internal_errors($previousLibxmlSetting);

        if ($xml !== false && isset($xml->Blobs->Blob)) {
            // NOTE(review): only the blobs present in this single listing
            // response are considered; if the service pages the listing, later
            // pages are never inspected — confirm container size stays small.
            $potentialMatches = [];
            $exactMatches = [];
            $langLower = strtolower($target_lang);
            $langUpper = strtoupper($target_lang);

            foreach ($xml->Blobs->Blob as $blob) {
                $name = (string) $blob->Name;
                $lastModified = (string) $blob->Properties->{'Last-Modified'};
                $timestamp = (int) strtotime($lastModified); // unparsable date sorts as oldest
                $size = (int) $blob->Properties->{'Content-Length'};
                logMessage("Found blob: $name (size: $size bytes, modified: $lastModified)");

                // Exact match - this is the blob named in the document key.
                if ($name === $targetBlobName) {
                    logMessage("EXACT MATCH FOUND: $name");
                    $exactMatches[$name] = $timestamp;
                }

                // The remaining heuristics only ever consider non-empty blobs.
                if ($size <= 0) {
                    continue;
                }

                // NOTE(review): these heuristics are very loose (a two-letter
                // language code or a bare extension matches many blobs) and,
                // when no exact match exists, the newest such blob is served —
                // possibly a different user's document. Kept as-is to preserve
                // behaviour; consider restricting to exact/base-name matches.

                // Language code anywhere in the name. The empty-string guard
                // matters: an empty needle warns on PHP 7 and matches
                // everything on PHP 8.
                if ($target_lang !== ''
                    && (strpos($name, $langLower) !== false || strpos($name, $langUpper) !== false)) {
                    logMessage("Language match found: $name");
                    $potentialMatches[$name] = $timestamp;
                }

                // Original filename (ignoring language prefix) anywhere in the name.
                if ($baseFilename !== '' && strpos($name, $baseFilename) !== false) {
                    logMessage("Filename match found: $name");
                    $potentialMatches[$name] = $timestamp;
                }

                // Same file extension anywhere in the name.
                if ($fileExtension !== '' && strpos($name, ".$fileExtension") !== false) {
                    logMessage("Extension match found: $name");
                    $potentialMatches[$name] = $timestamp;
                }
            }

            // Prefer exact matches; within a group take the most recent blob.
            if (!empty($exactMatches)) {
                arsort($exactMatches);
                $matchedFile = key($exactMatches);
                logMessage("Using exact match file: $matchedFile");
            } elseif (!empty($potentialMatches)) {
                arsort($potentialMatches);
                $matchedFile = key($potentialMatches);
                logMessage("Using potential match file: $matchedFile");
            } else {
                logMessage("No matching files found", 'WARNING');
            }
        } else {
            logMessage("Container listing could not be parsed or contained no blobs", 'WARNING');
        }

        // If we found a matching file, try to download it
        if ($matchedFile !== null) {
            // array keys that look like integers come back as int; normalise.
            $matchedFile = (string) $matchedFile;
            $blobUrl = "https://$accountName.blob.core.windows.net/translated-documents/" . rawurlencode($matchedFile);
            $directDownloadUrl = "$blobUrl?$sasToken";
            // Log the URL without its query string so the SAS token never
            // reaches the log file.
            logMessage("Direct download URL: $blobUrl (SAS token omitted)");

            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $directDownloadUrl);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
            curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
            curl_setopt($ch, CURLOPT_TIMEOUT, 60);
            $downloaded = curl_exec($ch);
            $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
            $finalContentType = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
            $curlError = curl_error($ch);
            curl_close($ch);

            $downloadedSize = is_string($downloaded) ? strlen($downloaded) : 0;
            logMessage("Download result: HTTP $httpCode, Content-Type: $finalContentType, Size: " . $downloadedSize . " bytes");

            // Only a 2xx response body is the file. Without this check an
            // Azure error document (403/404 XML) would be delivered to the
            // user as if it were the translation and no fallback would run.
            if ($downloaded !== false && $httpCode >= 200 && $httpCode < 300) {
                $fileContent = $downloaded;
                // Read back further down when choosing Content-Type/filename.
                $GLOBALS['detected_content_type'] = $finalContentType;
                $GLOBALS['matched_filename'] = $matchedFile;
            } else {
                $reason = $curlError !== '' ? "cURL error: $curlError" : "HTTP $httpCode";
                logMessage("Direct download of $matchedFile failed ($reason)", 'WARNING');
            }
        }
    } else {
        logMessage("Failed to list container contents. HTTP: $httpCode", 'ERROR');
    }

    // ---------------------------------------------------------------------
    // 3. Fallbacks: storage helper, then the untranslated source blob
    // ---------------------------------------------------------------------
    if ($fileContent !== false) {
        logMessage("Successfully downloaded file. Size: " . strlen($fileContent) . " bytes");
    } else {
        logMessage("Failed to download file using direct Azure listing. Falling back to normal method.", 'WARNING');
        // Make document ID available globally for the fallback method
        $GLOBALS['document_id'] = $document_id;
        $GLOBALS['original_filename'] = $original_filename;
        // Use the original download method as fallback
        $fileContent = $azureStorage->downloadTranslatedFile($targetBlobName);
    }

    if ($fileContent === false) {
        logMessage("Failed to download translated file from Azure Storage", 'ERROR');
        if (!$hasSourceBlob) {
            echo json_encode(['error' => 'Failed to download translated file']);
            exit;
        }

        logMessage("Attempting to download source file as fallback", 'WARNING');
        $sourceContent = $azureStorage->downloadTranslatedFile($sourceBlobName);
        if ($sourceContent === false) {
            echo json_encode(['error' => 'Failed to download translated file and source fallback']);
            exit;
        }
        logMessage("Downloaded source file as fallback");
        // NOTE(review): prefixing a text banner corrupts binary formats
        // (PDF/Office); behaviour kept unchanged.
        $fileContent = "[TRANSLATION FAILED - SHOWING ORIGINAL CONTENT]\n\n" . $sourceContent;
    }

    // From here on $fileContent is guaranteed to hold the bytes to send.

    // ---------------------------------------------------------------------
    // 4. Output filename, webhook notification, content type
    // ---------------------------------------------------------------------
    // pathinfo() omits the 'extension' key for names without a dot, so both
    // parts are read defensively to avoid undefined-index notices.
    $fileInfo = pathinfo($original_filename);
    $baseName = isset($fileInfo['filename']) ? $fileInfo['filename'] : '';
    $originalExtension = isset($fileInfo['extension']) ? $fileInfo['extension'] : '';
    $extension = strtolower($originalExtension);

    // New filename with language code prefix
    $newFilename = strtoupper($target_lang) . '_' . $baseName
        . ($originalExtension !== '' ? '.' . $originalExtension : '');

    // Send data to webhook
    $webhookData = [
        'tool' => 'DOCUMENT_TRANSLATION',
        'date' => date('Y-m-d H:i:s'),
        'user' => 'user@example.com', // Replace with actual user information if available
        'model' => 'Microsoft Translator',
        'settings' => 'Document Translation',
        'subTool' => 'Microsoft Translator API',
        'originalFilename' => $original_filename,
        'translatedFilename' => $newFilename,
        'targetLanguage' => $target_lang,
        'documentId' => $document_id,
        'file' => base64_encode($fileContent)
    ];
    sendDataToWebhook($webhookData);

    // Extension -> Content-Type. Legacy doc/ppt/xls deliberately map to the
    // OOXML types, exactly as before.
    // NOTE(review): presumably because the service returns OOXML for legacy
    // inputs — confirm.
    $ooxmlWord = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document';
    $ooxmlSlides = 'application/vnd.openxmlformats-officedocument.presentationml.presentation';
    $ooxmlSheet = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet';
    $mimeByExtension = [
        'pdf' => 'application/pdf',
        'docx' => $ooxmlWord,
        'doc' => $ooxmlWord,
        'pptx' => $ooxmlSlides,
        'ppt' => $ooxmlSlides,
        'xlsx' => $ooxmlSheet,
        'xls' => $ooxmlSheet,
        'txt' => 'text/plain',
        'html' => 'text/html',
        'htm' => 'text/html',
    ];

    $contentType = 'application/octet-stream'; // Default content type

    if (isset($GLOBALS['matched_filename']) && !empty($GLOBALS['matched_filename'])) {
        // A blob was matched in the container listing: trust its extension.
        $matchedFilename = $GLOBALS['matched_filename'];
        logMessage("Using matched filename for content type detection: $matchedFilename");
        if (preg_match('/\.([^\.]+)$/', $matchedFilename, $matchedExtMatches)) {
            $matchedExtension = strtolower($matchedExtMatches[1]);
            logMessage("Matched file extension: $matchedExtension");
            if ($matchedExtension !== $extension) {
                logMessage("Matched file extension ($matchedExtension) differs from requested ($extension). Adjusting filename.", 'WARNING');
                // NOTE(review): the language prefix is dropped here, as in the
                // original code — confirm whether that is intended.
                $newFilename = $baseName . '.' . $matchedExtension;
            }
            if (isset($mimeByExtension[$matchedExtension])) {
                $contentType = $mimeByExtension[$matchedExtension];
            }
        }
    } elseif (isset($GLOBALS['detected_content_type']) && !empty($GLOBALS['detected_content_type'])) {
        // No matched blob name, but a Content-Type was recorded elsewhere.
        $contentType = $GLOBALS['detected_content_type'];
        logMessage("Using detected Content-Type from server: $contentType");
        if ($contentType === 'application/pdf' && $extension !== 'pdf') {
            logMessage("Content is PDF but requested extension is $extension. Adjusting filename extension to .pdf", 'WARNING');
            $newFilename = $baseName . '.pdf';
        }
    } else {
        // Fallback to the original extension
        logMessage("Fallback to original file extension: $extension");
        if (isset($mimeByExtension[$extension])) {
            $contentType = $mimeByExtension[$extension];
        }
    }

    // Magic numbers. The ZIP signature must be double-quoted: in a
    // single-quoted PHP string "\x03" is four literal characters, so the old
    // comparison against a 4-byte prefix could never succeed.
    $leadingBytes = substr($fileContent, 0, 4);
    $pdfSignature = '%PDF';
    $zipSignature = "PK\x03\x04";

    // Basic content type detection from file content
    if ($contentType === 'application/octet-stream') {
        if ($leadingBytes === $pdfSignature) {
            logMessage("PDF signature detected in content, setting content type to application/pdf", 'WARNING');
            $contentType = 'application/pdf';
            if ($extension !== 'pdf') {
                $newFilename = $baseName . '.pdf';
            }
        } elseif ($leadingBytes === $zipSignature) {
            // Office files are zip files
            if (strpos($original_filename, '.pptx') !== false || strpos($original_filename, '.ppt') !== false) {
                $contentType = $ooxmlSlides;
            } elseif (strpos($original_filename, '.docx') !== false || strpos($original_filename, '.doc') !== false) {
                $contentType = $ooxmlWord;
            } elseif (strpos($original_filename, '.xlsx') !== false || strpos($original_filename, '.xls') !== false) {
                $contentType = $ooxmlSheet;
            }
        }
    }

    // Sanity logging for binary formats (never blocks the download).
    $isBinary = in_array($extension, ['pdf', 'pptx', 'ppt', 'docx', 'doc', 'xlsx', 'xls'], true);
    if ($isBinary) {
        if ($extension === 'pdf' && $leadingBytes !== $pdfSignature) {
            logMessage("Warning: PDF file doesn't have proper signature", 'WARNING');
        } elseif (in_array($extension, ['pptx', 'docx', 'xlsx'], true) && $leadingBytes !== $zipSignature) {
            logMessage("Warning: Office file doesn't have proper ZIP signature", 'WARNING');
        }
        // Ensure we're not sending text content for binary files
        if (preg_match('/^\[EMERGENCY FALLBACK/', $fileContent)) {
            logMessage("Error: Emergency fallback content detected for binary file", 'ERROR');
            // Still proceed with download, but notify in logs
        }
        logMessage("Binary file download: $contentType, size: " . strlen($fileContent) . " bytes");
    }

    // ---------------------------------------------------------------------
    // 5. Send the file to the client
    // ---------------------------------------------------------------------
    // The filename derives from user input: neutralise characters that could
    // terminate the quoted-string or smuggle a path (control bytes, quote,
    // backslash, slash). All of these are single ASCII bytes, so a byte-wise
    // replace is safe for UTF-8 names.
    $safeFilename = preg_replace('/[\x00-\x1F\x7F"\\\\\/]/', '_', $newFilename);
    if (!is_string($safeFilename) || $safeFilename === '') {
        $safeFilename = 'download';
    }
    $disposition = 'attachment; filename="' . $safeFilename . '"';
    // Add the RFC 6266 extended parameter so non-ASCII names survive; only
    // when the name really is valid UTF-8.
    if (preg_match('//u', $safeFilename) === 1) {
        $disposition .= "; filename*=UTF-8''" . rawurlencode($safeFilename);
    }

    header("Content-Type: $contentType");
    header("Content-Disposition: $disposition");
    header("Content-Length: " . strlen($fileContent));
    header("Cache-Control: no-cache, must-revalidate");
    header("Pragma: no-cache");
    echo $fileContent;
    logMessage("File download initiated: $safeFilename");
} else {
    logMessage("Missing required parameters", 'ERROR');
    echo json_encode(['error' => 'Missing required parameters (document_id, document_key, original_filename, or target_lang)']);
}
/**
 * POST a payload as JSON to the datastore webhook and log the outcome.
 *
 * Best-effort: every failure is logged through logMessage() and the function
 * returns normally, so a webhook problem never aborts the caller.
 *
 * Does nothing (beyond a WARNING log entry) when the DATASTORE_WEBHOOK
 * constant is undefined or empty.
 *
 * @param mixed $data Value to JSON-encode and send as the request body.
 * @return void
 */
function sendDataToWebhook($data)
{
    if (!defined('DATASTORE_WEBHOOK') || !DATASTORE_WEBHOOK) {
        logMessage('Webhook URL not defined. Skipping webhook notification.', 'WARNING');
        return;
    }

    // json_encode() returns false on failure (e.g. invalid UTF-8); sending
    // that would silently POST an empty body, so bail out with a clear log.
    $payload = json_encode($data);
    if ($payload === false) {
        logMessage('Error sending data to webhook: payload could not be JSON-encoded (' . json_last_error_msg() . ')', 'ERROR');
        return;
    }

    $ch = curl_init(DATASTORE_WEBHOOK);
    if ($ch === false) {
        logMessage('Error sending data to webhook: could not initialise cURL', 'ERROR');
        return;
    }

    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_POST, true);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $payload);
    curl_setopt($ch, CURLOPT_HTTPHEADER, ['Content-Type: application/json']);
    // Bound the call: the caller sends the file to the user only after this
    // returns, so an unresponsive webhook must not stall it indefinitely.
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, 60);

    curl_exec($ch);
    $statusCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $error = curl_error($ch);
    curl_close($ch);

    // Any 2xx is success (webhooks commonly answer 201/202/204).
    if ($statusCode >= 200 && $statusCode < 300) {
        logMessage('Data sent to webhook successfully');
        return;
    }

    // curl_error() is empty for HTTP-level failures, so fall back to the
    // status code to keep the log line informative.
    $reason = $error !== '' ? $error : "HTTP $statusCode";
    logMessage('Error sending data to webhook: ' . $reason, 'ERROR');
}
?>