Week Overview
This week focuses on file operations and data processing—critical skills for security automation. You'll learn to:
- Read and write files efficiently (wordlists, configs, reports)
- Parse structured data formats (CSV, JSON, XML)
- Use regular expressions for pattern matching and extraction
- Generate professional security reports
- Process large datasets with best practices
Section 1: File Operations
Reading Files
Python provides multiple ways to read files. The modern approach uses context managers (with statements) to ensure files are properly closed:
#!/usr/bin/env python3
"""
File reading examples for security work
"""
# Method 1: Read entire file (small files only)
def read_wordlist(
    filepath: str,
    *,
    encoding: str = "utf-8",
    errors: str = "replace",
) -> list[str]:
    """
    Read a wordlist file and return as list of strings.

    Wordlists often contain bytes that are not valid in the expected
    encoding, so the file is decoded explicitly and undecodable bytes are
    substituted instead of aborting the read with UnicodeDecodeError.

    Args:
        filepath: Path to wordlist file
        encoding: Text encoding used to decode the file
        errors: Decoding error policy handed to open() (for example
            "replace", "ignore" or "strict")
    Returns:
        List of words (stripped of whitespace); blank lines are omitted
    Raises:
        OSError: If the file cannot be opened
    """
    with open(filepath, 'r', encoding=encoding, errors=errors) as f:
        # Strip each line once and keep only the non-empty results
        return [word for line in f if (word := line.strip())]
# Usage
# Module-level example: loads the wordlist from a fixed system path and
# prints how many entries read_wordlist() returned. The call fails if that
# path does not exist on the machine running the script.
passwords = read_wordlist('/usr/share/wordlists/rockyou.txt')
print(f"Loaded {len(passwords)} passwords")
# Method 2: Read line by line (large files)
def count_failed_logins(log_file: str) -> int:
    """
    Count failed login attempts in log file.

    The file is streamed one line at a time, so memory use does not grow
    with the size of the log.

    Args:
        log_file: Path to the log file to scan
    Returns:
        Number of lines containing the text "Failed password"
    Raises:
        OSError: If the file cannot be opened
    """
    # Log lines can contain arbitrary bytes (e.g. user-supplied names), so
    # undecodable bytes are replaced rather than aborting the whole count.
    with open(log_file, 'r', encoding='utf-8', errors='replace') as f:
        return sum(1 for line in f if "Failed password" in line)
# Method 3: Read in chunks (binary files)
def calculate_file_hash(filepath: str) -> str:
    """
    Return the SHA256 hex digest of a file's contents.

    The file is opened in binary mode and consumed in 64KB pieces, so the
    whole file is never held in memory at once.
    """
    import hashlib

    digest = hashlib.sha256()
    with open(filepath, 'rb') as stream:
        # iter() with a sentinel keeps calling read() until it returns b"" (EOF)
        for block in iter(lambda: stream.read(65536), b""):
            digest.update(block)
    return digest.hexdigest()
# Error handling for file operations
def safe_read_config(config_file: str) -> str | None:
"""
Safely read config file with error handling.
"""
try:
with open(config_file, 'r') as f:
return f.read()
except FileNotFoundError:
print(f"❌ Config file not found: {config_file}")
return None
except PermissionError:
print(f"❌ Permission denied: {config_file}")
return None
except Exception as e:
print(f"❌ Error reading {config_file}: {e}")
return None
Writing Files
Writing files follows similar patterns. Always use context managers so that buffered data is flushed and the file is closed, even if an error occurs partway through the write:
#!/usr/bin/env python3
"""
File writing examples for security reports and data export
"""
# Method 1: Write text file
def save_scan_results(results: dict, output_file: str) -> None:
    """
    Write scan results to a plain-text report.

    The report starts with a title line, followed by one section per host
    listing its open ports and ending with a 40-dash divider.

    Args:
        results: Mapping of host to an iterable of its open ports
        output_file: Path of the report to write (overwritten if present)
    """
    divider = "-" * 40 + "\n"

    def report_lines():
        # Yield the report piece by piece, in the order it appears on disk
        yield "=== Network Scan Results ===\n\n"
        for address, open_ports in results.items():
            yield f"Host: {address}\n"
            yield f"Open Ports: {', '.join(map(str, open_ports))}\n"
            yield divider

    with open(output_file, 'w') as out:
        out.writelines(report_lines())
    print(f"✅ Results saved to {output_file}")
# Method 2: Append to log file
def log_scan_activity(message: str, log_file: str = "scan.log") -> None:
    """
    Add one timestamped line describing scan activity to a log file.

    Each entry has the form "[YYYY-MM-DD HH:MM:SS] message"; the file is
    opened in append mode, so earlier entries are kept.
    """
    from datetime import datetime

    # A format spec on a datetime inside an f-string is applied via strftime
    stamp = f"{datetime.now():%Y-%m-%d %H:%M:%S}"
    with open(log_file, 'a') as log:
        log.write(f"[{stamp}] {message}\n")
# Method 3: Write binary data
def save_network_capture(packets: bytes, pcap_file: str) -> None:
    """
    Write raw packet bytes to a file, replacing any existing contents.

    Args:
        packets: Bytes to store exactly as given
        pcap_file: Destination path, opened in binary write mode
    """
    with open(pcap_file, mode='wb') as capture:
        capture.write(packets)
    byte_count = len(packets)
    print(f"✅ Captured {byte_count} bytes to {pcap_file}")
# Example: Generate HTML report
def generate_html_report(scan_data: dict, output_file: str) -> None:
    """
    Generate an HTML security report from scan data.

    All values taken from scan_data are HTML-escaped before being placed
    in the page, so host names or timestamps containing markup characters
    cannot alter the document structure.

    Args:
        scan_data: Mapping with an optional 'timestamp' entry and an
            optional 'hosts' mapping of host -> info dict; an info dict
            without a 'ports' entry is reported with no open ports
        output_file: Path of the HTML file to write (overwritten if present)
    """
    from html import escape

    generated = escape(str(scan_data.get('timestamp', 'N/A')))
    parts = [
        "<!DOCTYPE html>\n"
        "<html>\n"
        "<head>\n"
        '<meta charset="utf-8">\n'
        "<title>Security Scan Report</title>\n"
        "</head>\n"
        "<body>\n"
        "<h1>Network Scan Report</h1>\n"
        f"<p>Generated: {generated}</p>\n"
    ]
    for host, info in scan_data.get('hosts', {}).items():
        ports = ', '.join(map(str, info.get('ports', [])))
        parts.append(
            '<div class="host">\n'
            f"<h2>{escape(str(host))}</h2>\n"
            f"<p>Open Ports: {escape(ports)}</p>\n"
            "</div>\n"
        )
    parts.append("</body>\n</html>\n")
    # Encoding is fixed to UTF-8 to agree with the charset declared in <head>
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("".join(parts))
    print(f"✅ HTML report saved to {output_file}")
Working with Paths (pathlib)
The pathlib module provides object-oriented file path handling, superior to string concatenation:
#!/usr/bin/env python3
"""
Modern path handling with pathlib
"""
from pathlib import Path
# Build paths from a base directory
wordlists_dir = Path("/usr/share/wordlists")
rockyou = wordlists_dir.joinpath("rockyou.txt")  # Equivalent to the / operator

# Report the wordlist's size only when it is actually present
if rockyou.exists():
    print(f"✅ Found: {rockyou}")
    print(f"Size: {rockyou.stat().st_size / 1024 / 1024:.2f} MB")

# List every .txt entry directly inside the wordlists directory
for wordlist in wordlists_dir.glob("*.txt"):
    print(f"Wordlist: {wordlist.name}")

# Make sure the output directory exists (no error if it already does)
reports_dir = Path("./scan_reports")
reports_dir.mkdir(exist_ok=True)

# Name the report after the current time so runs do not overwrite each other
from datetime import datetime
timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"
report_file = reports_dir.joinpath(f"scan_{timestamp}.html")

# Path objects can read a whole file without an explicit open()
config = Path("config.txt")
if config.exists():
    content = config.read_text()

# ...and write one the same way
report_file.write_text("<h1>Report</h1>")

# Inspect the file that was just written
if report_file.is_file():
    print(f"File size: {report_file.stat().st_size} bytes")
    print(f"Modified: {datetime.fromtimestamp(report_file.stat().st_mtime)}")
Section 2: Parsing Structured Data
JSON - JavaScript Object Notation
JSON is the most common format for APIs, configuration files, and structured security data:
#!/usr/bin/env python3
"""
Working with JSON data in security contexts
"""
import json
from pathlib import Path
# Example: a small CVE database held as a JSON document
cve_data = '''
{
"CVE-2021-44228": {
"description": "Log4Shell - Remote Code Execution in Apache Log4j",
"severity": "CRITICAL",
"cvss_score": 10.0,
"affected_versions": ["2.0-beta9", "2.15.0"],
"published": "2021-12-10"
},
"CVE-2014-0160": {
"description": "Heartbleed - OpenSSL TLS Heartbeat Extension Information Disclosure",
"severity": "HIGH",
"cvss_score": 7.5,
"affected_versions": ["1.0.1", "1.0.1f"],
"published": "2014-04-07"
}
}
'''

# Decode the JSON text into nested Python dicts and lists
cve_db = json.loads(cve_data)

# Report only the entries rated CRITICAL
for cve_id, details in cve_db.items():
    if details['severity'] != 'CRITICAL':
        continue
    print(f"🚨 {cve_id}: {details['description']}")
    print(f" CVSS: {details['cvss_score']}")
# Read JSON from file
def load_cve_database(filepath: str) -> dict:
    """
    Load CVE database from JSON file.

    Args:
        filepath: Path to the JSON file
    Returns:
        The decoded JSON document
    Raises:
        OSError: If the file cannot be opened
        json.JSONDecodeError: If the file does not contain valid JSON
    """
    # Binary mode hands raw bytes to the json module, which detects UTF-8
    # (with or without BOM), UTF-16 and UTF-32 itself instead of depending
    # on the platform's default text encoding.
    with open(filepath, 'rb') as f:
        return json.load(f)
# Write JSON to file
def save_scan_results_json(results: dict, output_file: str) -> None:
    """
    Write scan results to a file as JSON indented by two spaces.

    Args:
        results: JSON-serializable scan results
        output_file: Destination path (overwritten if present)
    """
    encoder = json.JSONEncoder(indent=2)
    with open(output_file, 'w') as out:
        # Stream the encoder's chunks straight to the file as they are produced
        out.writelines(encoder.iterencode(results))
    print(f"✅ Results saved to {output_file}")
# Example: Parse nmap JSON output
def _as_list(value) -> list:
    """Return value as a list: None -> [], a list unchanged, anything else wrapped."""
    if value is None:
        return []
    if isinstance(value, list):
        return value
    return [value]


def parse_nmap_json(nmap_file: str) -> dict:
    """
    Parse nmap scan data stored as JSON.

    NOTE(review): nmap's documented output flags are -oN/-oX/-oS/-oG/-oA;
    there is no -oJ, so this presumably targets a JSON conversion of the
    XML report. The layout read here is:

        {"nmaprun": {"host": [{"address": {"addr": ...},
                               "ports": {"port": [{"portid": ...,
                                                   "state": {"state": ...},
                                                   "service": {"name": ...}}]}}]}}

    A lone host, address or port stored as a single object instead of a
    list is accepted as well; when several addresses are listed the first
    one is used.

    Args:
        nmap_file: Path to the JSON file
    Returns:
        Dictionary mapping each host address ('Unknown' when absent) to a
        list of {'port', 'service'} dicts for its open ports
    """
    with open(nmap_file, 'r') as f:
        nmap_data = json.load(f)
    results = {}
    for host in _as_list(nmap_data.get('nmaprun', {}).get('host')):
        addresses = _as_list(host.get('address'))
        ip = addresses[0].get('addr', 'Unknown') if addresses else 'Unknown'
        results[ip] = [
            {
                'port': port.get('portid'),
                'service': port.get('service', {}).get('name', 'unknown'),
            }
            for port in _as_list((host.get('ports') or {}).get('port'))
            if port.get('state', {}).get('state') == 'open'
        ]
    return results
# Practical example: Vulnerability scanner with JSON config
class VulnScanner:
    """
    Vulnerability scanner configured via JSON file.
    """

    def __init__(self, config_file: str):
        """
        Load scanner configuration from JSON.

        Args:
            config_file: Path to a JSON file holding the scanner settings
        """
        with open(config_file, 'r') as f:
            # Full decoded configuration, kept so scan() can echo it back
            self.config = json.load(f)
        # Settings missing from the file fall back to these defaults
        self.targets = self.config.get('targets', [])
        self.ports = self.config.get('ports', [80, 443])
        self.timeout = self.config.get('timeout', 2)

    def scan(self) -> dict:
        """
        Run scan based on config.

        Returns:
            Dictionary with the current time as a string under 'timestamp',
            the loaded configuration under 'config', and a 'hosts' mapping
            (left empty here; the scan logic is a placeholder)
        """
        # Imported locally so the class works even when the surrounding
        # module has not imported datetime itself.
        from datetime import datetime

        results = {
            'timestamp': str(datetime.now()),
            'config': self.config,
            'hosts': {}
        }
        # Scan logic here...
        return results
# Example config.json:
# {
# "targets": ["192.168.1.1", "192.168.1.10"],
# "ports": [21, 22, 80, 443, 3389],
# "timeout": 3,
# "output": "scan_results.json"
# }
CSV - Comma-Separated Values
CSV is common for tabular security data (scan results, logs, vulnerability exports):
#!/usr/bin/env python3
"""
Working with CSV data in security contexts
"""
import csv
from pathlib import Path
# Read CSV file
def parse_vulnerability_report(csv_file: str) -> list[dict]:
    """
    Parse vulnerability scan CSV export.

    Expected format:
        Host,Port,Service,Vulnerability,Severity,CVSS

    Only rows whose Severity is exactly 'Critical' or 'High' are kept.

    Args:
        csv_file: Path to the CSV export
    Returns:
        One dict per kept row with keys 'host', 'port' (int), 'service',
        'vuln', 'severity' and 'cvss' (float)
    Raises:
        KeyError: If an expected column is missing
        ValueError: If a kept row's Port or CVSS is not numeric
    """
    wanted = ('Critical', 'High')
    # newline='' leaves line-ending handling to the csv module, so quoted
    # fields containing embedded newlines are read back unchanged.
    with open(csv_file, 'r', newline='') as f:
        return [
            {
                'host': row['Host'],
                'port': int(row['Port']),
                'service': row['Service'],
                'vuln': row['Vulnerability'],
                'severity': row['Severity'],
                'cvss': float(row['CVSS']),
            }
            for row in csv.DictReader(f)
            if row['Severity'] in wanted
        ]
# Write CSV file
def export_scan_results_csv(results: list[dict], output_file: str) -> None:
    """
    Write scan results to a CSV file with a header row.

    Each result must provide 'host' and 'port'; 'state', 'service' and
    'banner' are optional and default to 'open', 'unknown' and ''.

    Args:
        results: Scan result dicts to export
        output_file: Destination path (overwritten if present)
    """
    columns = ['Host', 'Port', 'State', 'Service', 'Banner']

    def as_row(entry: dict) -> dict:
        # Translate one result dict into the CSV column names
        return {
            'Host': entry['host'],
            'Port': entry['port'],
            'State': entry.get('state', 'open'),
            'Service': entry.get('service', 'unknown'),
            'Banner': entry.get('banner', ''),
        }

    with open(output_file, 'w', newline='') as out:
        sheet = csv.DictWriter(out, fieldnames=columns)
        sheet.writeheader()
        sheet.writerows(map(as_row, results))
    print(f"✅ Exported {len(results)} results to {output_file}")
# Example: Process firewall logs
def analyze_firewall_logs(log_csv: str) -> dict:
    """
    Analyze firewall logs from CSV export.

    Only rows whose Action column is exactly 'BLOCK' are counted.

    Args:
        log_csv: Path to a CSV file with at least the columns Action,
            Source_IP and Dest_Port
    Returns:
        Dictionary with 'total_blocks' (int), 'top_blocked_ips'
        (source IP -> block count) and 'top_blocked_ports' (destination
        port as read from the file -> block count); counts are not sorted
    Raises:
        KeyError: If an expected column is missing
    """
    from collections import Counter

    total_blocks = 0
    blocked_ips = Counter()
    blocked_ports = Counter()
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation requires for file objects.
    with open(log_csv, 'r', newline='') as f:
        for row in csv.DictReader(f):
            if row['Action'] != 'BLOCK':
                continue
            total_blocks += 1
            blocked_ips[row['Source_IP']] += 1
            blocked_ports[row['Dest_Port']] += 1
    # Hand back plain dicts so callers see the same types as before
    return {
        'total_blocks': total_blocks,
        'top_blocked_ips': dict(blocked_ips),
        'top_blocked_ports': dict(blocked_ports),
    }
XML Parsing
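Many security tools output XML (nmap, vulnerability scanners). Python's xml.etree.ElementTree handles XML parsing. Note that it is not hardened against maliciously constructed XML, so use it only on reports from sources you trust: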
#!/usr/bin/env python3
"""
Parsing XML security data (nmap, Nessus, etc.)
"""
import xml.etree.ElementTree as ET
def _first_hostname(host):
    """Return the name of the first <hostname> under <hostnames>, or None."""
    hostnames = host.find('hostnames')
    if hostnames is None:
        return None
    hostname_elem = hostnames.find('hostname')
    return hostname_elem.get('name') if hostname_elem is not None else None


def _open_ports(host) -> list:
    """Return port/protocol/service dicts for the host's ports in state 'open'."""
    ports = host.find('ports')
    if ports is None:
        return []
    open_ports = []
    for port in ports.findall('port'):
        state = port.find('state')
        # A port without a <state> child cannot be confirmed open; skip it
        if state is None or state.get('state') != 'open':
            continue
        service = port.find('service')
        open_ports.append({
            'port': port.get('portid'),
            'protocol': port.get('protocol'),
            'service': service.get('name', 'unknown') if service is not None else 'unknown'
        })
    return open_ports


def parse_nmap_xml(xml_file: str) -> dict:
    """
    Parse nmap XML output (-oX flag).

    Hosts without an <address> element are skipped, because there is no
    address to file their results under.

    Args:
        xml_file: Path to the XML report
    Returns:
        Dictionary of hosts with open ports and services: each address maps
        to {'hostname': str or None, 'ports': [{'port', 'protocol',
        'service'}, ...]}; port and protocol are the attribute strings
        from the file
    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML
    """
    # NOTE: ElementTree is not hardened against maliciously constructed XML;
    # only parse reports that come from a trusted source.
    root = ET.parse(xml_file).getroot()
    results = {}
    for host in root.findall('host'):
        address_elem = host.find('address')
        if address_elem is None:
            continue
        results[address_elem.get('addr')] = {
            'hostname': _first_hostname(host),
            'ports': _open_ports(host)
        }
    return results
# Usage
# Module-level example: parses 'scan_results.xml' (resolved against the
# current working directory) and prints one header line per host followed
# by one line per open port.
nmap_results = parse_nmap_xml('scan_results.xml')
for ip, data in nmap_results.items():
    # Fall back to 'N/A' when the host has no recorded hostname
    print(f"\n🎯 {ip} ({data['hostname'] or 'N/A'})")
    for port_info in data['ports']:
        print(f" {port_info['port']}/{port_info['protocol']} - {port_info['service']}")
Section 3: Regular Expressions (Regex)
Regular expressions are essential for extracting structured data from unstructured text (logs, banners, responses):