#step1 assembly
# Run the software on a Linux system
# Download data from MG-RAST (http://www.mg-rast.org/) by searching for the keywords “soil” and “environment”, and use the raw data as the starting data.
# Data processing using Metawrap
$ wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh
$ bash Miniconda2-latest-Linux-x86_64.sh
$ conda config --add channels defaults
$ conda config --add channels conda-forge
$ conda config --add channels bioconda
$ conda config --add channels ursky
$ conda install -y -c ursky MetaWrap-mg
$ conda install -y blas=2.5=mkl
# MetaWRAP is now installed successfully
$ MetaWrap assembly -1 data_1.fasta -2 data_2.fasta -o output_dir
# After running the command above, you will get a results folder. The folder contains four parts; among them, the .fasta file is the assembled sequence.
# Rename assembly.fasta to dirn.fasta, and then use this sequence in the local BLAST step.
#step2
# Run the software on a Windows system
Download ncbi-blast-2.9.0+-x64-win64.tar.gz from https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ 
# Which package to download depends on your computer's operating system and architecture
Install this software, add an environment variable BLASTDB: C:\Blast\db, and add C:\Blast\bin to the system variable Path
C:\Users\**>cd C:\blast
C:\Blast>blastn -version
#blastn: 2.9.0+
# Package: blast 2.9.0, build Mar 11 2019 15:18:27
C:\Blast\db> makeblastdb -in CopA_reviewed.fasta -dbtype prot -title " CopA_uniprot_reviewed" -out NR
# In this particular case, CopA protein sequences with known function reported in UniProt are selected as the database. We have already uploaded this sequence file.
C:\blast\db>blastx.exe -db NR -query dirn.fasta -out dirn.out -evalue 0.000001 -max_target_seqs 5 -num_threads 4 -outfmt 6
# This step BLASTs the nucleotide sequences (blastx translates the nucleotide query and searches it against the protein database)
C:\blast\db>blastp.exe -db NR -query dirp.fasta -out dirp.out -evalue 0.000001 -max_target_seqs 5 -num_threads 4 -outfmt 6
# This step BLASTs the protein sequences against the protein database
# Use NR as the local database, and dirn.fasta/dirp.fasta as the query sequences
#step3
# Search the potential CopA genes using NCBI ORF finder
# Website: https://www.ncbi.nlm.nih.gov/orfinder/
# Since CopA amino acid sequences are between 500 and 800 residues long, ORFs in this range are selected as the target data.
# Use the TMHMM (http://www.cbs.dtu.dk/services/TMHMM-2.0/) online analysis platform to predict the transmembrane helices.
# Then predict the functional domains using Pfam (http://pfam.xfam.org/)
# Phylogenetic analysis
# Analyze the similarity between the ORFs and the reported copA genes by constructing a phylogenetic tree using MEGA 7.0. Select the sequences that meet the requirements and move on to functional verification.